diff --git a/.gitattributes b/.gitattributes index c7d9f3332a950355d5a77d85000f05e6f45435ea..d40a8e3a6db603e92566d8a96524a49ee3496d5f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -32,3 +32,2593 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl 
filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_3.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl 
filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_3.jsonl filter=lfs 
diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_4.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_4.jsonl filter=lfs 
diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs 
-text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_4.jsonl filter=lfs 
diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_1.jsonl filter=lfs diff=lfs merge=lfs 
-text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_2.jsonl filter=lfs 
diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_5.jsonl 
filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_1.jsonl 
filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs 
-text +4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_1.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl filter=lfs 
diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_1.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_2.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_5.jsonl filter=lfs diff=lfs merge=lfs 
-text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl filter=lfs 
diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_4.jsonl 
filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_4.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_5.jsonl filter=lfs 
diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_3.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_1.jsonl 
filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_0.jsonl 
filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs 
-text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_3.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_0.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_0.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text 
+4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_1.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_1.jsonl filter=lfs diff=lfs 
merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_3.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_1.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_2.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_0.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_4.jsonl filter=lfs diff=lfs merge=lfs -text +4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_3.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..adc77154346980452c39829d225f78794c1a0337 --- /dev/null +++ 
b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4358876645741479, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04475556276641748}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.08601617848273047, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003027222218230607}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.29226306498620885, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005344776046578116}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11084365567127677, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022963454664604076}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.04041394987853437, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002132583963194437}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1415225530229299, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0034803378779995004}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05146498360167007, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014277295107541841}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.08239970522779808, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029176133842350504}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2827601971319833, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00519215142624447}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10611300272902285, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002132214334314073}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0815748628057772, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002939098844173709}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2756279858381666, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0049864009961195395}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10445493190549919, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00215267434014802}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a4f3a0e480b91cc6ed6c2d7b0dada49379ed1b21 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6458190048668558, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03274318464461181}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.13985160542322855, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004374312136397895}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.32007070778063884, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004962532451445636}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.16170102806244036, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036142070180597507}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.0694179465016437, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002921286815952664}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.15883761787677303, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003480119679954502}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.07885240324677927, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023523131620574156}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.12591553956133228, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003854734977633317}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.30061625652088586, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004630766957074663}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.14742576479874345, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0031238502419319854}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.1283694931084644, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003947705918972332}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3021994113995889, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004647780404212457}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1497000837635611, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0032201737172114866}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..29a54d354902b4479492dbc9b81db9496b1ead4d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.7970016196921748, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04581096081746233}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.18059337363344904, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005339355314623877}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3595991748021704, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004980869877485108}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.19703997895188197, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0042195368281687335}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.09688101950971822, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0036313101633365223}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.18755499121707667, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00376908816872013}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.10271084479951895, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0029196673345892295}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.15982668252407492, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004603772659928646}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3351391580099154, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004585332134390738}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1772530360597836, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0035959119026034024}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.16383301190827276, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004742860098171715}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.33868792774850437, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004634428210263519}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.18084941775566396, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037099919587234576}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9890b843b9a3b155155651d9ec7feabcc3d436ad --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.9098079216712224, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.049900879662925314}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.19212508744736018, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005617174437077427}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3703600547581712, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004938819064876214}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.20522389412159425, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0043615630325957435}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.10484577205978293, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003926715523602227}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.19383710059664028, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003742953011901514}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.10810375284373198, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003075123686799679}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.17083343128882605, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004911086258050898}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.34476103034505623, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004533133149997157}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.18478247984025434, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003743746721954891}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.17532235438945584, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005084911003429309}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3483193020595072, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0045873162051077825}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.18830268691133512, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003857827801954891}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4ae0bbea64b33f852ef440376fbb5b51a235770a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 1.052600188336286, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06665675598684426}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.20108020030540724, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005644545699403245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3818834779332785, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004965307876320074}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.21317352472996992, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004401402126159575}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.11098039808305214, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003927637491564618}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.204923577311545, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003948475544315969}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.1144786865204847, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003126094675218737}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.1772302061036221, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0048597986989461165}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.35524087803093607, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004631433081108199}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.19087293738824496, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037489593597534975}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.183821519223083, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005085692199041701}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.36063124635272004, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004673788271599251}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.19652423173229852, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003927577783905062}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c7479bdd1d98c6a3977330d398ef2636ef40ff3e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 1.0960931080747824, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06344656593457922}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.21325470731107665, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005803174587983953}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3950162351216541, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004977496708782323}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.22442306072164447, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004490024810310396}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.11715823165872309, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004000375023616823}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.2112966644324947, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003874637116812206}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.11914106590715341, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003087212545803013}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.1866801247482292, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004986038924327078}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.36529275376302506, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0045911093135183}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.19921393354497344, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037728930475513494}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.1931918251374917, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00519492169870331}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3699764207220656, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004618932928096925}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.20449901465956502, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003922872033805495}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aaf71ede44579e8ecb21dd0071109cb3fcabc98b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.020068624571258766, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0003544600157797195}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.1322006158687265, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0015422644620452844}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.03365291879999083, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005366360315631209}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 7.220799408035192e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 2.9844951292528377e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.000540983067735179, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0002518276515393111}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.0001247128207234875, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 5.298844359373387e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.020051794824344497, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00035194324590660044}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.13215658314776402, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0015394013545400771}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.03362856973591861, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005332360539269732}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.012103709829564288, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00020447718700316613}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.08586525160141571, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001063601998485439}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.020445712613489434, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00031075004163469113}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 0.0038097370395060908, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0002219610347061402}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..49c204649b527e4e5807fd0e4d63a1f10103d7e2 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.43626603059505226, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006451581496667008}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.35883293430646573, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005004495762257855}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.3481327312179078, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00455441730109508}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.21137487603719227, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005114961059001828}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.16806612977368068, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004004860983552607}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.1630534157576409, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003746557325381462}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.3652237379799452, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005732161684830544}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.30192882739130533, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004466009943504772}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.28990188092888874, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003995413413537247}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.3840495459688683, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005925377180947352}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.31495878380419884, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004547250127908277}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.3040864791542594, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004078079672821507}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 6.989261876343655, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.40263571084172284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d136988b77fa436990efd8ca66966adf6a1da514 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6362956511318935, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00538379300124836}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.4834946858936047, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004801472074915238}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5078107057324294, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004140666670121251}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.37135935928638336, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005066687538961897}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.27760684248102696, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004280106934969935}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.29067052011025934, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004014125890256947}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5265082399354665, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005036016101378371}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4026738032734296, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004454404008264425}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.41972809208999595, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003877940140365374}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5567637819834128, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00512230716108874}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4214614745436478, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00441184308930783}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.44168829809558013, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038261243388568083}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 13.165075565004848, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.27581373067423187}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ad50ff8264d664117c0071a0153c486b9cdce54d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6553503707751134, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005259442835843647}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.4846986680036073, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004743810212237689}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5164843715667228, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004023809678916401}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3859418988153908, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00508153143197431}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.2822822546562199, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004225169829828983}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2999645170234003, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004030255955877291}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5440132947723192, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004956312605167472}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.40534188613718314, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004461515220119515}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.42918810840416266, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003874643346859152}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5740345018130067, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005053174990259695}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.42360054475089143, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004428112509567798}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4507171622999017, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003825881679108077}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 13.845066320476562, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.29872110566731597}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d7dc5cb361f808ab2f2563c1ccdb8da3f0a325cc --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6634050196446688, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004886612003234737}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.4946481837392698, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004782084733979873}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5276390672600301, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038324896467705604}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.39296194860834516, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004819100077639621}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.2919119739384423, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004378654594933089}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.30837818748213996, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003926426979248677}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5480449518158801, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00473481568788781}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4112826943812621, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00457624935912133}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.435888745068994, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003799604065357722}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5816122645097909, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004785030630138882}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.43250226005582193, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004539900910445077}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.46048000687942087, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00372310461186504}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 14.04704336798738, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.32871093104279264}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3b3360c0fcf49795d00d72c87d623fa141999795 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6713532949429953, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004849071608319205}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.4972585619402549, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00487037629660061}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5333758121056072, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0039026319169523597}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.40052984046218604, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004867933712938873}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.2959925459372112, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004393215975299111}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.3145492300965214, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003964256746686906}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5555980201751652, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004696669229961967}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4141273952452268, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004576109419667718}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.44150951115272263, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003805310353862312}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5896648146852733, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004767497844568815}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.43581277222608317, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0045433886339008465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.46635341830341503, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003728106132230896}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 14.378790163362357, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.32798586772214977}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5e587b9738f4089eac318754acf9c9f028e8ccb2 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.33024351617759024, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.051704359395549765}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.009770361346008592, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008287344074198066}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.04975365662487859, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029466525852222055}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.012752235193236958, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008894854760023045}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.0019298118317708673, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002966982970755425}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.010649081491194475, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001169743419884876}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.0026950461011573386, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00033028725537539126}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.008462458079394903, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006841590966581536}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.04579276197685639, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027055630960688658}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.011182273722871986, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007469860690080579}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.008468919439670847, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007353173223218566}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.040428841607895255, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024829119630856503}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.0108012920574524, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007827266712588734}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..66616ed1b5a9f1ca9bcd10168a1ebcdbfb26d712 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 6.477679081695303, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.33565119461770165}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.46830027855874495, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006300101063417185}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4241912009571325, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005104576106588849}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.38775536001537486, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004542171141416536}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.2406644049185146, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004806392395353992}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.21737008513740708, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00410188320499371}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.19596829580492064, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0036425337193625837}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.3847305010892318, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005484468612669837}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.35286837769549867, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004536843256864709}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.3181224584805221, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003913082026303714}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.40901456737899233, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005705074341293744}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.37079747847290656, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004607267630637304}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.3369880890693486, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004020946462748482}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0f3bd0ec2aba54683f8aa1840910ec254c8305c5 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 10.552449097458222, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.5075359348522721}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.5925714099982897, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005886614000972573}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4987525489495299, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0048306591159496546}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.4908297093515496, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00440292254550668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.33677107097184933, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004976427241032287}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.28054277473630357, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004218111612756725}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.2745170311876481, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003943249150000542}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.4820426153517795, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005228080311572792}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.41071941896392805, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004422749977888421}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.39923737082902666, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003936921688224572}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5150722423509299, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005454009317285427}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4334293723998458, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004448148760691382}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.424618929381837, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004010785107308282}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fd5da1f3526a0107335995b33848041204c50fe1 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 14.529959728631091, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.6848885034123435}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6291060498809192, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0053931097015041864}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4998057724661308, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004853037618098533}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.5118142183656486, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0041047394326870785}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.3647296831743616, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004961071506430092}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.2891088910523139, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004352334361895284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.29393229614602107, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004028779652804252}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5178169597520474, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004966872786567351}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4154131273981219, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004548123711158703}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.4217841035478285, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003898458206388385}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5510576711663073, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005100868381183197}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4367988103969493, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004517807079232531}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.4462117602532069, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003857887615444402}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b7d591ebcef28ab912a8d80b6563a12e23a53566 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 14.501938799210457, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.28415569927247564}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6469147946550013, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0050072292924981275}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4985296869712893, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004847713959588524}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.523291959319419, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003969739964628797}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.37974321642248593, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004758307047276704}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.29279785614455683, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004396790314269892}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.3040146901882556, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003948804430975799}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5316402821401439, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004701609433025525}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4143927386535714, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0046130550286461736}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.43115407624884194, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038357397668621392}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5694903223235266, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0048218066519585745}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.43811279641548045, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0045849735847942824}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.45876891046754215, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003791252529951684}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b5ab3a72845db4c27a242f675450ad2c626b5aa8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 14.556796585022564, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.27852899955430843}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6610002782815263, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005017622845157595}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.49345537230504805, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004924666218534147}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.5246830517987607, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0039796631999095165}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.39346059742147355, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004835604517590671}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.29186648429984857, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004369750234644563}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.30795637687201854, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003917110735112965}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5472030882550708, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004704868980457638}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.41209896039479726, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004617070429531718}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.4348976156448126, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037848622247426116}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.58299611946106, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004785834392496742}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4337099508428588, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0045686487281792094}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.4603023464895643, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037008836133842087}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6f77e57d0921c7440d5c949873821e4631b552a7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.06706070850923308, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001641705982706511}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.2309061560559356, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00584465805287697}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.07242669297423687, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001681684378008165}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.011010788571996902, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005458956739597873}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.08004790916119307, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003344495650227847}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.018236794772880795, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008615346385667937}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.06355399391256518, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015568088892838425}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.21801502942955459, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0054970304208384465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.0672739987848506, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014524145543017358}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.06145516925393397, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015616424129383}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.20027279793496214, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004938295524532196}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.06387955329019673, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014285492353000519}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 0.24984031541663096, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.012924324247896772}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..69d7bf81df8380e9c21ca3e8b5c9c2f909a31d1b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.3181030735546715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005373925400256652}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.3251558121181215, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00531664302610407}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.2792227778567483, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004263305560309992}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.12418420030878015, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0038826294171554997}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.1312949228236328, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0038357971846482808}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.10849910513809453, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003156980188228955}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.2631322967918368, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004569485790819501}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.2743254974329521, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004722899827718838}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.23040067712932039, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0035540519679111504}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.278813174699537, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0048188144324256485}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.28403936265612606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00469510820335236}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.24297581585449876, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00371995328995244}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 3.705116187171442, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2792269501223854}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7323d42d3c4472dd9da1d7567bce93580c846d60 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.600466363774825, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005282943419136165}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.4910404404053262, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0048916435292608714}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.494991635014208, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0040212580043449625}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.33593288364017954, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004653747053149475}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2768267729211498, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004295843993088077}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.27481294702147024, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0037598053969334754}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.49001524253465367, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004759003697565942}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4057892261642288, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004509538826924196}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.4047413734025506, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0036897985733284535}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5207320474983632, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004942597449806194}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.42593556430915974, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00447978031567887}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.42756211299071967, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003668514458622858}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 12.258816180492442, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.33509170789115567}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c2077723e970162877c5dc51f4079a46fef02da3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6396291098157352, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005012372082127943}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.49296712645136953, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0047855206125547054}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.5154529984750105, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003810967011831987}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.36107447936375614, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004688596438977035}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2798463176070886, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00422023661159036}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.2891214170531268, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003789802955066307}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5210394733474353, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00467589207565221}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.40605775552470086, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004478812348360755}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.4211027205368336, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003666346201802303}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5551710306111246, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004806611169149116}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.4276039412948363, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004451737455618187}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.4457300868732614, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003613131206631707}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 12.641415106662606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2559752456319122}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1e86d0292cc6db399e0772de68e5ef90ae96bead --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6533901529130552, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004750298489587612}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.4978245547862729, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0049638184483227245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.5254658165967703, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003834943975498437}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.3739710377631859, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004650812086245668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.28782083216563187, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004469320375742115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.29898924205288324, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003896580955933994}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5341947075791592, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00448509853552663}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.41109794264396293, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004669221276796458}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.4305179535570766, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037087908384292726}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5678536307532968, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004614146022878327}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.4313369622755667, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004611186352566589}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.45413143397226563, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003621738095592773}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 12.51566452371121, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18662930850276757}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4833e01682002a85d8ae01d65c3dcb5caf68c065 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6634432758822567, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0048481143494641795}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.49724281083508204, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004869426527869909}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.529417668132056, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003826308214918395}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.3899421272006655, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004781244315624528}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2916271410817384, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004359700641177092}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.30703573949811763, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0038660421241393456}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5444083011792844, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004568064871523696}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4128466016133311, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004599967068818717}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.4357909154278315, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037014554665568476}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5776387605813129, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004683314725587799}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.43239954634861594, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004546461380714694}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.45908015170176464, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036338440583547262}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 13.024109481625127, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.37467296505230513}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..14f5946fdac137e64bcc7a77fb752207c9f1c97c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.026407959502331903, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00038868194633258883}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.22356670586627483, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002332253284549633}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.04588118282553198, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). 
\n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006171605320982103}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.0029537913433360027, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 8.371907248046422e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.028781138144721528, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0009026844718533621}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.0052053675573166145, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00014522168539481952}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.026294482230459135, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000384669915057379}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.22280019098723994, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023265096508733036}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.04568781459679956, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006108800927922479}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.022403360020811003, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0003171490032197158}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.19384325682164458, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0020160161936303308}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.039001539304585, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005037468676529433}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 0.03302516501887599, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.006176680874969092}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5d26c156bc82672a84cf176493229fe9afbe75b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.44343574580605305, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005836165049538884}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.41167731843976096, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005244681170881314}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.3810155399369177, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004510941559773663}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.20955577077441034, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0045867433415744925}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.19673389590965332, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004066132879496505}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.1786263826302864, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003609690020384216}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.3659912580345323, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005168110662696892}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.34168226812319036, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004631056048180813}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.31302032604227675, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003914907165679426}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.3879497324594378, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005327568561873762}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.3585198739927853, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004665725786249284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.3308490189257667, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003986497508848012}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 5.92155994361654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.4368162501010792}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e4f73130b88c348251a8967b16c5be1790f11a07 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.6148890739123269, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005151170963850425}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.48751527677115314, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004910148217088944}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.50099732123994, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004071389975898556}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.3368551697989736, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004690360563606446}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.2679764741145998, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004297248080849364}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.2722552210596285, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003869080854816975}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.4990454995661043, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004719356768833076}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.3991993603426751, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004509330239847587}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.40620639130610015, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003732849234734221}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5311216931077634, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004866432973979014}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4194624660588917, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004505522174144945}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.4297560700484828, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037232038393596916}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 11.653311716196534, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1705735396868982}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..76effe4d3357b9873ef9912abcab0cf2ad1c58f0 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.6521291829493591, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004862083336775399}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.4925886684392769, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0048436222050010115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.5212796553891711, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038836053434542343}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.37085082947442344, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0047222546332911465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.28212557152458906, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0043025347256123855}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.2948667808479424, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003866435870905894}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.5348811228515187, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004618628192727038}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4069011016263415, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00451821222866322}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.427692905935477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037140323185331395}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5673764064869086, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004712552384750834}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4266064089546778, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004471064283067558}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.45113220970476947, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003652549529693666}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 12.507248519002058, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.33581711795849156}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9e7d8781045968890bc3e46b328071e168445eac --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.6640191119912606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0046287852254404665}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.48722206954684877, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004886820654506614}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.5250242061127607, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038183444409012057}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.3809187248804994, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004628751830319166}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.2823123595032247, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0043838158850553486}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.2999240110444844, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0038838867088996087}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.5440068657045747, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0043952553052520405}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4027090518140514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004609214493770483}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.430984226701193, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0036987359826037795}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5781874081685048, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00449591006527644}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.422950486144644, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004568519660925329}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.4551521831250583, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003631473140613488}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 12.396312569329053, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.3293469366739629}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..50dad1c985b0f439cd4d39edf127703293f8faaf --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.6664615175587149, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004810736210585854}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.4888785939143449, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0050250180805546055}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.523785247748256, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00396852764381019}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.38655727666760126, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004811990065987808}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.2857193152364473, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004475062913060129}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.30152821306111477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003959284720918688}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.548696399374843, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004627017088614486}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4056777694315134, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004679087627002612}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.43158846013751617, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003813577934303311}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5792867412593304, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004732363307721942}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.42303066939122486, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004651572108090147}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.4523923440770611, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037402755921038884}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 12.891215677061275, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.33496311481577984}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5bbc72f2b7c28f44dac00fb5b0ae367d22172380 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.16545410465514598, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022997761900446748}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.28561683985985054, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033805688000285673}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.1943147054100485, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002290302577130692}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03891245662630143, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008607597452070799}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.07031350658097817, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016328568998679233}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.046143499270621315, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009592242142563405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.1105354395920507, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015190328037893866}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.19933561661994773, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025398698192885018}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.13129771132175647, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015070177742292337}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.15358147119074234, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00214171849980464}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2656395423722148, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003155332061428429}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.18042769676325615, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002125927765273512}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.5181046478114517, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09250562496871702}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ed3aea7e02b7d7e9d9464b301b19de46a1dd7a39 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.19118099408569436, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022318050478954713}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.30065602094383553, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003003943741161264}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.21191127536224905, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019609532949790213}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.043480246462308615, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010898262094121871}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.07030911937863445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016278311940230515}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.04782532792275138, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge2_fmeasure_stderr": 0.0009895310959648507}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.132695679844194, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001577353316550692}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.21276743865231712, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002279648108033155}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1469428089542953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013001589353820034}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.17835239766630012, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002087403245048929}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.28062460803225625, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002812309149448641}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.19748390376103794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001814999070191041}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.2923870243499453, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04710829598595837}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0b261f7100f8a9e0cf052b657818c7c003e7a342 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.22004296063312756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027587193644827523}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.3004661215661355, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029546541029262952}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.22370918766321005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002010231202549798}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.05536118871732041, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014858900012090124}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.07390397982561925, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016332246699294153}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.054327487227033164, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010981648478134627}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.15663926213726143, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021407976310347816}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.21401909770058278, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022276760034737396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.15758363100687361, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014304910029667866}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.20481968410585505, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026160792844735293}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.27917387789267256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002758702443143936}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.20778987647499506, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018852458382730504}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.711904811057529, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09526160140046504}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_3.json 
b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2fee1a1836b660dac2b65fd7684f9a972fe20aa1 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.20203951914040996, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033197172999644433}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.24139296790371112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003407257826021676}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.18732343321931774, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024323481657191164}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.05217979169112856, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016808629240166956}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.060144137552866045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015840842099530614}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.04629662621989341, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011281742136129457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.14790066047047568, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002654555605999541}, {"task_name": "GEM/wiki_lingua_en", 
"prompt_name": "article_summary_en", "rougeL_recall": 0.1738019326166572, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025360210562127504}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1339808748098793, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001742036427781668}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.1879613854814907, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031385903112030257}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2239521824907146, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031803147008683547}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.1737001196246117, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00226610556612505}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.873796046005488, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13673291688563485}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..77699e68eca880a83101baee1658125e8ca76473 --- /dev/null +++ 
b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.0735717278931429, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0028911131224153295}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.07553229981002776, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002757787802272632}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.060947459186220604, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002112636651398563}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.020903657542024313, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014483735027008208}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.019261648377261246, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010410235853285292}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.015588317769795648, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007937674501197837}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.05657476965321192, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002371018879045927}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.05566946485627606, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020412553156093018}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.04515352188180819, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015731768240294375}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.06864642701940102, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002737856522478581}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.06985653918253502, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002550011308620696}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.05648029062299134, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019666947501841563}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.2200180577672771, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.027482870742044186}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ead843117ae70f8261699dd018231cd29d707e87 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.012343062142861565, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013545796255510047}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.012305611615879857, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012537625370804336}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.010077141412512394, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009974698986143257}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.0033110648238113935, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005010803065960274}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.003581989670869063, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005441626132486554}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.0028348539794116954, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003871006036728002}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.009684293482664644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011205772153478955}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.009463327618981224, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009853253825312287}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": 
"article_summary_en", "rougeL_fmeasure": 0.007688930494688018, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007722547402435018}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.011416410255274385, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012625531153532816}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.011395661121003155, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001165756457145875}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.009289497933603756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009175552178181104}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 1.582389521440466e-10, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.0336511902248086e-09}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4fc999a0e6a31c469f1e80cada7eaccdab4c2e92 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.049207080285889174, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001218170848562994}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.07381385524331617, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0018227019727620334}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.05451479621157953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012807036115901312}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.0051941806701366125, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00035925475672845153}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.00839259424814296, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005661600346822327}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.005939258116110201, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00038468302190640825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.04351987471606278, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009955603897709461}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.06622021616649243, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0015711536395290884}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.04839456994889072, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010565437353728235}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.0459909349232918, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011148896123208223}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.06933867001801945, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0016979262230734408}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.051006434420687016, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00117710177594407}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.4202315524967184, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07509386796041675}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..92a21614a2fe9899e3acca37c785443e38f70ec5 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.12085269058219979, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018800465788083838}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.1212329663199883, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00187706234222241}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.10539766483016742, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014004693299149832}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.009745493152667832, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007924698057957289}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.010021780512207765, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006924096781473867}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.008130044029674053, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005061494758793983}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.09639222192782304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00149350986619975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.09656042627392813, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014661442579061582}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.08335270418305636, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010371033295063308}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.11579018628895178, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001768658177302644}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.11633579095629488, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017821423582159675}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.1010214292579844, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013158243234685884}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.6101972105932388, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06528883196121242}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7bfc4129b330c158bc7aaac8d13616700692940e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.22250413866904517, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0038252703699176048}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.19070576171594192, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029483914941301126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.1683928422031243, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022763664965003553}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.0621189927043659, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0023125503328516586}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.04614677867390645, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014257058600775931}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.040944778973814884, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001133602887491379}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.1765987395394153, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032959155772800614}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.14723158465597147, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022781392441315216}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.12964889700558796, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017255313237463151}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.20961600752743212, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036729560713837975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.1785620404996782, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027700352932856034}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.15762456715786408, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002130633426308049}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.5697392740712903, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09726441182310538}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f66e9840f79db76b567984a028b957c0699f9fe3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.23920789042272972, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004451923794477901}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.16718906497079217, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030207461399335495}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.15814533627204788, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024790082791499913}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.07309736843132873, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026804185613156914}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.043829507427460696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013802638969192078}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.042100869824327695, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011891039526604893}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.19334436627613655, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003819886080978273}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.12983200684113314, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002334520345992616}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.12346136179216806, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019112011754057825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.22602981564601687, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004287782432287635}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.15608600439681367, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002829819880008212}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.14794587354702365, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002325660472000532}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.149834523005355, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07337967845410115}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2cda40e0846825b3f6ba2e58cb6c8f800fba7187 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.08353070569399211, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035352228648524034}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.05030529429455797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002119662209344908}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.04952738568586151, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019248164005399725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.02788475548850126, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001984709851822524}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.01357745926029204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008062711899462007}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.013962256327531126, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008076774569833121}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.06988961888688942, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003098125936762339}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.04011808095891787, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016990965796509772}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.03967195287913463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015485058524769843}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.07917805251121919, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003396133321736807}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.04716680249732015, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001999098077515287}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.0464257712662957, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018095491536731455}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.034389491270844154, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.00795211648185989}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bc2f9d515be0453e282934b92b4140a54ede6e58 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.013359898000632503, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001546173211155727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.00743087996116675, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008730744758598823}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.007524276436169901, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008101397059229119}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.004979282809533886, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008580275451638574}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.0021429531558444645, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003530144211538606}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.002347560172350861, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003520939095930996}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.011124009635183039, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013504382029806005}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.005991212365068976, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0007027748270460832}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.006087946245577977, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006667486081828124}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.012927390131069527, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001511374194406188}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.007111949315448997, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.000841252461914259}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.0072146284393324175, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007801116029630629}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 8.014966822272051e-19, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.0374532265072001e-16}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b415fdc773a6b3445bda0daf64de67ce7f790d94 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.10084728088346058, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016704126782767605}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.11626998129546122, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0017095825126744002}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.09861978093907281, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0013312382950616123}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.006952750862086854, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003742497583440925}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.00825312679525918, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005060847331287482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.006774058456222539, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", 
"dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003619264283480028}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.09346515420059896, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014766983553922265}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.10921956551040214, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0015306854518442438}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.09204815585560853, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011610452838456383}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.08684132289392488, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014723630029859183}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.10119415272052232, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0015325967355127004}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.0850753438963473, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011505739554051005}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.24069952714187115, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 
0.04999681194672116}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bcd7dc39f65a3c8f19ee13eef27debce127c16fb --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.12570484541604557, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019387862247634239}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.123630356642246, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0018183757555234191}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.10847730157557842, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001363680486425725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.01104088364395252, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008639559463958727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.009877120720020166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006063789166641115}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.008501169030695347, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", 
"prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004987236739387054}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.10137716059142088, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016064276788596392}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.09917736982376567, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001424294120823274}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.08658591358168741, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010363082761354987}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.12102429972223623, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018621970559901282}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.11902518130446503, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017340172425578562}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.10436489995945487, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012929050919925859}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.6572484010354098, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06678803371648423}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ab7f1b4463290420c815d96bcd2742cf1ff5a28b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.2117483237424628, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003673007092156956}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.1807723247472404, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028119380867935155}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.15978364007037885, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002146039638632564}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.05045804846177444, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002077814678823861}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.038071387292995425, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012888149094590308}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.033815746580063144, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in 
English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010388210907900453}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.16822046314553674, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00309089860213003}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.1411027078757953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002198021261123935}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.12422850518706006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016333916321201456}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.2001613012527362, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0035246041252274714}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.16951618612440267, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002612196179301279}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.1501340500446388, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020038342663596972}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.1040082038162398, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07459272356146383}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e396704bd972820cd80076b3927b14939868fd2e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.21604655852321505, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004237013318778521}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.15574838747308595, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00290352384129497}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.1454209509941261, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023687960335153093}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.061614289889954106, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024851490831325055}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.037776751473578556, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013168249511316943}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.03567535556584864, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in 
English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011201666793845067}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.17476206179888693, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003650963432920612}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.12177092267587064, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022748704863199223}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.11395609046047189, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018362494195700422}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.203925907737883, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004055385996449827}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.1463238349053685, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027458312154874562}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.13649183555298453, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022246127803878944}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 1.8684497943291858, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08209544953446724}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9b6c4ca99d5556557e53280dbf5a5d2dd2715075 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.07275705303203882, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032507746107726535}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.046114009178668744, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0020790831962699103}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.04440729385734818, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018127470127528602}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.022744080016362914, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001828829152352875}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.011929912900514794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0009066681223803078}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.011240567915400753, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above 
in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007301610677896298}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.06041777404457702, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002845907268084594}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.03649435423952065, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016477572016587662}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.035245517816726114, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014338608453415918}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.06910110864393543, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031267048764333605}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.043190618850823925, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0019524580787881156}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.04166885497985569, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017010247269226748}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.023154739388285894, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.005577121563550948}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bc69ae25a15b2914f4917298e8ab061b7606d28c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.010296457827498611, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012918860319172164}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.006758532981021845, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008379711362413936}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.006737087899764484, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007910409443146855}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0036868286820582992, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007834130189216914}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0016965282718283603, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00031792662046301257}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.0018352478253792217, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the 
text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003166724369022159}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.008409961437487717, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011155876274861894}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.005373734560444225, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0006740932130105292}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.005318064241692839, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000633253894423773}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.009930592441167776, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012607226775993969}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.006454884476813964, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008097834237119673}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.006429830165009096, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007604504484812541}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 1.901663426428568e-18, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 7.736570961567845e-16}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..56fbf720422c9d16d4a4c06171aa5712e1f5b655 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.122915905754568, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027293510115185423}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.1767818917839268, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033841479310980638}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.12988512607981967, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024439343174664525}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.026806616635118036, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008667140586963389}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04199675102883295, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014222033630357748}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03000716115226249, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009354733970545372}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.0938740707460975, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", 
"subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002177005796825213}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1374323483251746, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00266526479478189}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.09879731353636668, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001799103726263371}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.11485464277414363, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002596139008897079}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.16449395724493923, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031654167638919778}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.12094018141973355, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002284523760450248}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.278917009551764, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09140957868815724}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fa54d70a3b1c8711f431a6795303f90b7689b82c --- /dev/null +++ 
b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2321033086693269, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003444990523939608}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.21218959051209052, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028780222957239345}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.18879633772816867, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002216973861149929}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.05986007346081548, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0021311948977268975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.048915032760613236, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014864819960528294}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04395688090556475, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012580561387008646}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.18107984090374726, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028960619334779066}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.162566462675585, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002220609266556534}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", 
"rougeL_fmeasure": 0.14456965085181128, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017115993134306947}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2192803166896013, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033059632616290542}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.19959677205987747, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002694200019026103}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.1776723900847922, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020872816698884362}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.707464673902901, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1122035248088363}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c83727a95ec544892dddf42b7ae95eef027887a4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.3147030364809968, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0037689305738243476}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.25097433153568405, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028925947424101703}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.23558432539564403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021996425715538554}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.09338720402263002, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002368756352904399}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06942686553346078, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016202813479301297}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06557325455887651, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013735424734152642}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.244935161158131, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003198495702253031}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1911576784626261, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002276055119508395}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.17976921379978278, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001722115310366516}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2958662208042827, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.003600934016250172}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.23572659332760645, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002741901482334439}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.22099937310878076, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002075646266476727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.7904798767943335, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11122294896753164}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b185beb60fe2d8b6c45612814fffe0352db092d3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2754936136179268, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004157379347035767}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.20528413095627832, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003149288824376793}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.19838663921126778, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002621759128746396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08256853694746276, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024364087557486665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05727334409426975, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015675844642289158}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.055968236855402816, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014034751658766104}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.21625894747073696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034653747339702624}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.15717871923447801, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00246051792291125}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.15243563853997083, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020406692721428466}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2591405389486435, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003962509253114959}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.19231398442318168, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029662256128315167}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18589041350629654, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", 
"prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024658633555113495}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.8632324313744557, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09299178906804655}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..29a054034522f0ab177ebe924a42cb24295be3f8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.09597320792153687, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003523744199921863}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.06743171611131621, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025672971207144984}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06581475266895565, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002308785748398076}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.03073553158490422, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018205157024659981}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.02032186815564804, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011741990653970874}, 
{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.019691124849550927, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001032704031652877}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07767834454496185, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002968257043282438}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.05327838136711029, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020669018184854763}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.05209993435974213, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018508808521006768}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.09035808108893525, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033673742748827878}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06286918923105832, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002407780221261254}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06145306117564354, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021697142437582524}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.08259030504562871, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0071288759428394894}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c1d51f6c467c7e657453897c3678436979859c78 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.015768279475199848, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015994817709509021}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.010208407683374372, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011101868759327263}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.00991335048881209, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009630926105753386}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.005050282757985984, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007965820847066382}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.003233565984990156, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005539676407167323}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0029524028339129186, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00041005985526957695}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.013277476442981736, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", 
"dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013881298634661826}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.008616852733661218, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009635704353861369}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.008248037341762705, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000807006653030911}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.014923565167243124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015288899364523146}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.009697966584991889, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001073756967121187}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.009376779842315807, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009193911532830411}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.0184106792465108e-15, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.78802158675685e-13}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..bd72bbbae9b72ecc564b8b0d7059ebf95659659d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.09716914266381921, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001588873777767349}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.113047966597192, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0018138880339988268}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.09393150748763507, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0013913781802491765}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.00789406798092684, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000419736727273889}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.010369466926148594, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006220741626663643}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.00810320245121132, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004246779834740568}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.08623270741496217, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013189384242570964}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.10103979947244672, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001525640240790577}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.08344187952213442, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001134658015336061}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.09230820827944372, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015118723923496866}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.1069999135512711, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017087689994245597}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.08890494751613388, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013055354696445939}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.4520613110187872, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04670340488015433}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ff0c8c68c11a2e298112dbedeadb31869769c8bf --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.1184008680654741, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001564826825421052}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.11934831803746493, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0016574926168922098}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.10497305468531427, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001255055086560988}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0065474359357168685, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00041243581464096575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.007144064143658665, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00047460386541255963}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0060518186828094475, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003723118068721728}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09478143647519513, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001219626359065995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.0956209796427602, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0012884715653876924}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.0835136406112498, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009308872181256089}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.11382913352428527, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014806869787433784}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.11499167572068492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001581922066092159}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.10097338753997802, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011828254832324744}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.5155792843718754, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04989759827988764}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6169eb25fe889f2e612559972dadb21b4dde7582 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.15273932782795627, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022156125742117706}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.19651328239605736, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002663780636112302}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.1507885545235868, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001834000670129471}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.02517008324499953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009798089194818487}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.03413193930670911, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001221169425838899}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.024795023268022354, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007930977642022854}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.11730847696393268, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016795313897096911}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.15294841448894894, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020599858142528278}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.11559317558886968, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013085530731690627}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.14392075150794262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020782027930187456}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.18487493068333727, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002490594176024061}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.14189539617563585, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001709091043439753}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.5716024718768677, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06669078339037757}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..59b7b3ad94a12fb7d813660e131a32bbd4210ad1 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.13391084968723968, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0028933184881919445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.15711899898252504, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029594045076376902}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.12179800960613144, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021714004036956338}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.029181772262838104, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014231487802429015}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.032454024258731846, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012888650580723702}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.02465336911318367, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008917747894159659}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.1045676504106663, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002295339185386772}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.12418186583034126, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002364123273303685}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.09436711261591924, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016117593482645508}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.1256412476000786, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0027136444937049084}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.14768417753168112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027908694567627588}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.1142421768175613, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00203307593772788}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.6448018903331314, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08986896028234234}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1626914f1998895e7680485582b7e7fd11424f0e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.03781390643712872, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019725007865177713}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.04248222303303341, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0020576616744248126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.032778186929640955, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015160232311598606}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.008727821675270711, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008860783473493923}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.009476342855751762, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008552628886109314}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.00675165993793729, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005128558386297882}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.030251516700860112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016420771541506996}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.03390501778846023, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016639430097277806}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.02576037746303003, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011788924925310038}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.03554070882434891, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018653666178719184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.0395625119959948, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0019080538751288382}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.03062804358044647, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014159235675085848}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.03793835793809636, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.004944367898593229}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1c527a3b6da7e1176c9c8cd1ac7b08d71a66e406 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.003663087080839873, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006889075640243746}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.004094057107563647, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006895204034868401}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.0028680994456484797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0004506085285145695}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0010185261931717935, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00038254237678243645}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.0009427307765006302, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00024016077148222763}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0006182627044826944, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0001367325028734569}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.0029725997727319074, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005844640815613739}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.003399810296798685, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005797265646639166}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.0023407410922934428, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00036664012211638547}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.0035444907573141614, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006683251187877269}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.0039682862002560155, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0006700453965634391}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.002768076356007906, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00043244630871936497}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 7.352480577371536e-22, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.69354618200267e-19}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..49f11c72fefcae52ac35a5fdb67aba6efc0a1dff --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932577}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.345, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015039986742055235}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..249d31857f5a96c5a2c7defb45a67f4a7757049f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.331, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203931}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.326, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014830507204541028}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7ad7a97101a4f1df94e130ac8a592c7d565450fc --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.347, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01506047203170662}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.346, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015050266127564436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b74ea05f09650b763b4e7e9856021a625daf54cd --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.337, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014955087918653596}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.348, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01507060460376841}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d17df4d3eaa32ea7409c1b74884ffc0303d9acd5 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.344, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015029633724408948}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.351, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015100563798316405}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..33699d66bdb61760f438dc1fcbd24c4676c4ba94 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.346, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015050266127564433}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.354, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015129868238451772}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2af184ffca04764a2845414d80433ac3e1603525 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.336, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795027}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.316, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01470919305605714}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b768bc59a930bc5e8afaefab100fc5d665e82884 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7dabf138eee7733aff1e75d498d10fd13abf1029 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.362, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015204840912919501}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.358, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015167928865407557}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..babef901a1a7e0eea11a2048c69e72172a5ab08f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.351, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015100563798316405}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.348, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015070604603768408}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b06bdd1ed0f174cdecee4a204b8f4135db87a3bf --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.331, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203933}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.337, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014955087918653602}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8ce37ccb88cdce008c5c7cfdcf3e308dcc7a9e45 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.348, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01507060460376841}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.347, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01506047203170662}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..537c2a1f8db9fc782cce73fb00973feaa6e443eb --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01491084616422987}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.345, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015039986742055235}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6c2de58ae28dcf75fc3d4ff2e1cadda287101b06 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8e72daa8666c3d26584e231bc2efd7df2613f1b6 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.364, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01522286884052202}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.348, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015070604603768408}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..902b3356ba4e9c569cbc792c7545d841566b8768 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.356, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015149042659306626}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.358, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015167928865407559}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c2ed9659f58c2ca107f466e2040a6931789dcafc --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.353, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01512017260548369}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.351, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015100563798316405}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e6110ed8d1a67e8fc7bbcff9d37507d323b3f365 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.349, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015080663991563098}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.338, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014965960710224473}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c52a29de53d3d245c37a9cf6f9ee99fb3df1b830 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229857}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01487687202745673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..25026ab4415fc93fe80efe3ce12d46bbd37fa43e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095526}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.321, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934647}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a85e2dc596a7bc97f55d53438da9d85c8b991c3f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.323, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014794927843348637}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456732}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7d71735e8e6169cfe4ed34ea0fa4bf7073098ce7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095524}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.328, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014853842487270333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3dcf2e2a58fa87e81bad1bee5677a37e65a26184 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.329, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928369}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811492}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4bd3fc8764a263cd65c8969c03d1feb2f2c82d65 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795023}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.343, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015019206922356953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d7a93d35ea02e77735b826e6494207f8805c4fc7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.338, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014965960710224479}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.342, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015008706182121731}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ed7daefa4e4dd81f9c5f04bafc208637be2f539b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..443d243b3dc92b09252783af76f6dd6517c2b0b4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.37, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01527525231651936}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.359, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015177264224798594}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cb9e85c34eba87e6e57e977e0c56a75c93f8891b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.364, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015222868840522019}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.375, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015316971293620996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..67f3846413139219e19bf393d868d54c2fd90349 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.36, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015186527932040117}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.366, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015240612726405756}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f05f6017d6eadec7d626755d53652e687b7094d4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r1_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.346, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01505026612756444}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.347, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015060472031706618}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b0628af7d0bf4985241e790c52065f4321f4ffb3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01493311749093258}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.352, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015110404505648666}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2705cf5c12388fa00820b0ff9faf723d083adf17 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.34, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014987482264363935}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.332, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811492}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..05d72290542734a59e7427c8d755b0295421f9a6 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.351, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015100563798316407}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.339, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01497675877162034}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f3c6c4931df7d22eb32380efc7099b2cc81c37a4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229871}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.332, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811492}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2e1f06de35566cd3c26050c235018623d292eab3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.326, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541031}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.327, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01484221315341124}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ae4a50bd52ff0a55bf5e64b4a8f355934071bd5e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.332, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811494}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.327, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014842213153411239}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cf6b8d6d07be01ae4ec21de4d26db2dffd866221 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732958}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.349, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015080663991563097}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7dacdd502e0c6b3a07db0534a7008d8eb711c6e2 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2c95b5389f555e87c6dfd950d1e4c892c5ca2e00 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.318, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014734079309311901}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.319, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014746404865473479}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9b1eda0f743aef2b210357889c66aaa64cf54f8e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.308, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014606483127342761}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.309, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014619600977206491}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8246f16213dd393c3ce6cd4eb7ea79668f40dd6b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.3, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014498627873361427}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.31, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014632638658632907}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..37827b7928a919a2d7eac727b563086afd2c4bae --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.304, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014553205687950418}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.319, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014746404865473491}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2c6a0f1493b3b2215bc5489e702e3dafaf62f904 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.342, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015008706182121731}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203933}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f20adce7e199b4e2d83c6030db569d0b36666716 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..38f8039173c174a1735d21d65db1aa927a0897d1 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.323, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01479492784334863}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014842213153411239}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3ce6652ed491a6a231eb2e6d899493a86e9f58d3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014955087918653595}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014955087918653595}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..975a18b2982b18a7829cc349d677057d91ab01fb --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.312, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01465847437050901}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014842213153411244}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5d665878f40f40459379ded791a7e6289ba2954e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.317, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014721675438880224}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.321, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934644}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b46ec16c4ada92ed562a06c5985bd3448fcda44f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.334, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732963}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.318, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014734079309311901}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a15b49f02b6a068ab03f9e5e027b6f358ccf17aa --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.316, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014709193056057128}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.316, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014709193056057134}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1e9d972965e18a11ec0bf302186c3d9f13f1c5a2 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.326, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541035}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.323, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014794927843348633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cec471404309cdcf3ad0a657c9e77d374c6d7b9a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.317, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014721675438880213}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.322, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996681}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fe11ac2c73576839f0104fff86b620075e47d00e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.326, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541031}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456734}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9fcde5ac06833e02015ace3046dbad9db7b96bfb --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.309, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014619600977206484}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.304, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014553205687950455}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..29033b8173505ed043ff6d8b0d85857179422b8a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932579}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795021}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c6be51fb6b2de4d1a8fc335a07bd35abf11384ee --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f262e3813c900611c308800c66468586563c30ef --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095522}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014842213153411239}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..52e16266f7f8742d1c2e3b9cc4a2106d941774e3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014806864733738863}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795023}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6b516d994e29ef13557428dde4c189210abbb0ea --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014782913600996685}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2b3248828d38d729b8df29536bbd4b89b4fb5a6a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r2_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.318, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014734079309311903}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.321, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934642}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4ccf714cec704585a692e243f316bb24e78f6fcf --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3358333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013639261190932879}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3308333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013588208070708993}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a87250673abb0c8abd9fe5b7179ee2d4d9994fed --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013605417345710528}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..37423538e292810cead7216e2bd457ac0bc237f4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.33416666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013622434813136778}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013630871843821477}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..625b29cebc89055d92ef46a26d280110efca6b56 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3358333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01363926119093288}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3433333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01371263383046586}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..730aa4873928bb7922e2fc54e8ee9bfa2f96fc7b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3425, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013704669762934728}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.35333333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013804572162314925}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..04047844129c5c0e9d78b641be8edb758f2fddf8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3491666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013767075395077252}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3466666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013744022550571946}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..608eaed3eaa49885b057051c02dec4c871d564d2 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013630871843821476}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881627}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5e6a56acb7f425cceb2886d620337828020b91b4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..53225d26016831a9e1fe39b3037d3a2f39bbfd0d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3383333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013664144006618266}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.34, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013680495725767797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8f5f8c7e1404c603d8fa03fe73c732cf89d5f936 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3308333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013588208070708995}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3358333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013639261190932886}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..43531abf5b4c8a85e4f4ca830ad14727d0ceaf55 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3308333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013588208070708999}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.31833333333333336, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013452948996996296}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..af976330d88bcb7e2436975620bf3881152177a0 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3175, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013443538681348052}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.31916666666666665, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01346230971200513}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..44108f584c4324b91e8fecf7a613af2da393b30a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3383333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013664144006618265}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3358333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013639261190932887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..daf25fcf0f23c29566150c470b09fa965d1f217b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..04d1ea664bb601decc3e1eb6b08aa5a57e95061d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3175, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01344353868134805}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013415009084004862}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bae15739a32f42d85fd5f89ca0134b918cde1ad9 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3333333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013613950010225603}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d1c54012f74c02f7d377afd8b116252182765416 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.31833333333333336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01345294899699631}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3283333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013562032919529019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c75c4fd412f21ad4b365bfb9b2e5e81140b24885 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.31333333333333335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013395739415639082}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881624}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b3f62dc5ce87abd8d221616c073adaf7fee317a1 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3275, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251954}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013471620929769144}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1852ec2e5bdfcb764cfea507a1d9738965fa19d7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3333333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013613950010225606}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..20f2de8cfbd1fe17eba6207c3b9fe7e00de006f8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3233333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013508372867300215}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013526454480351021}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..04200f0878ea120ca3244d8d2fc16f4c83b1a7ba --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3283333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013562032919529017}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32166666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013490095282989521}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5c5831b450c6563b2542839e61ab9d61d944fe8d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.31833333333333336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013452948996996296}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.31166666666666665, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013376268790982103}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..98647777e733e7b0d32d7afe6193918c226c30e8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.31166666666666665, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013376268790982098}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.30666666666666664, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0133166423190707}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f1b473a385bec6733f4cda30e4b43313d47c5af8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.33916666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013672343491681819}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.32916666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013570806258433628}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e3d2ab37696e0b1eba7a4a4896554e07384facce --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..90ca7c142f14141ebd93041026402bd69dfdd4ab --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.30583333333333335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01330652625583115}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.31333333333333335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013395739415639082}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..23d80ba5abacf2d5efac2033ece96ddd452692e4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406401}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01360541734571053}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4bc9460f2bb3e11a63b6efa88f850ea2f7051118 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3233333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013508372867300224}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3225, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013499258621103247}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..25596f14485af66e87e6a6fd093ee161dff742f4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_anli_r3_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.305, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013296358936471115}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.31666666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013434078660827393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ff91de3fa22b9eae8d0a3dbd3a411fb89f0617ed --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2226962457337884, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012158314774829931}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2226962457337884, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012158314774829931}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ef0d3ceafd7fd943b4f182d4583e051fcdc1a0cc --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": 
"heres_a_problem", "acc": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012288926760890797}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012288926760890797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..91f0d4da28edda690cf4cbd49d1880cd42073de8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012336718284948854}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012336718284948854}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..386d7fb331ec9dd5290f40f8f3439c9f88f7ffec --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.22866894197952217, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0122728535825408}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.22866894197952217, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0122728535825408}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a3612d809368b8d4c7444c7210b0925c58d0de8a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012336718284948854}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", 
"prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012336718284948854}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7af34edb0fdcc46d55ed30b6a43e4df8689fa81a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012336718284948854}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012336718284948854}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..183c979c4f6e9d54fcdd4bc648bf5b4de1deff2c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2627986348122867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", 
"acc_stderr": 0.012862523175351333}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2960750853242321, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013340916085246263}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a2801f08af7c5a61487084a5d9ac5dd02c117e3b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.25170648464163825, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012682496334042967}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29436860068259385, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013318528460539427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0c4e872c07845169f8cb869d17cba8f187784a6e --- /dev/null +++ 
b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.24146757679180889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012506564839739429}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.27474402730375425, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013044617212771227}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6a5a138743d0d575e67fea370f010bcc1cc2aa2e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2380546075085324, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01244577002802621}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.26791808873720135, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012942030195136432}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..93da66c0267f3036bdcb6501dad70c4da033c2b5 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012414960524301827}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.27474402730375425, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013044617212771227}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e19d286693f71430ce16637f3ded9e28323b42ae --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.23378839590443687, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012368225378507156}, {"task_name": "arc_easy", "prompt_name": 
"i_am_hesitating", "acc_norm": 0.2687713310580205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012955065963710686}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..621e9c0b5b1c61b78077696656cc1ef3183552bf --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2508532423208191, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01266819862131543}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2568259385665529, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0127669237941168}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c842b30c82d3d199881f95664834e3371051d985 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": 
[{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.22781569965870307, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012256708602326907}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.24829351535836178, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012624912868089762}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3a376737ef574ba4169b4d507e4a5e3af680afdf --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2167235494880546, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01204015671348119}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.22781569965870307, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012256708602326916}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e79e2bb7f3cf50aa54416291c0ecbd26e31b4d5e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.22610921501706485, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012224202097063284}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.22866894197952217, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012272853582540802}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..71ed56ed510d201e771a6439844ce61cf2aa3ac5 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.22696245733788395, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 
0.012240491536132872}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.24146757679180889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012506564839739434}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ee48d13c340da69cb43fbddcb95eb776900f40af --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2235494880546075, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012174896631202612}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.23037542662116042, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01230492841874761}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5a2b83702e0d3265ee194c6efc8f1c1967c1d6fb --- 
/dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.22525597269624573, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012207839995407305}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.22525597269624573, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012207839995407305}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e62eb5ced28ca515dd4a758da36001355731143 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.22696245733788395, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012240491536132873}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.22696245733788395, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012240491536132873}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..62e7ecedd06c0df9de3ad4b32deac383ad69c8b4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23293515358361774, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0123525070426174}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23293515358361774, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0123525070426174}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7f36a0403670c720b68f5d416fed1654549e9a0a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012536554144587094}, {"task_name": "arc_easy", "prompt_name": 
"pick_the_most_correct_option", "acc_norm": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012536554144587094}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..59511ba28aadff306aaefe09174b89cecfded66f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012414960524301832}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012414960524301832}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ff5c2a634a47bc53d3650219f87743717f66c377 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", 
"prompt_name": "pick_the_most_correct_option", "acc": 0.23976109215017063, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012476304127453963}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23976109215017063, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012476304127453963}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c14f1d61709e89553aff3d832c5c12db662a350c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2593856655290102, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012808273573927092}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29436860068259385, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01331852846053943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_1.json 
b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6687306b8d6c65bbfb016a345dedebfb4bec5373 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.25597269624573377, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012753013241244518}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2935153583617747, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013307250444941124}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ce3cb5d6ced15f08c67914125b878d19c34918ff --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2568259385665529, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0127669237941168}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2841296928327645, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013179442447653887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of 
file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..674aa8074e4b2ec31222e99e3eee43ea02788fd7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26023890784982934, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01282193022511255}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2696245733788396, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01296804068686916}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6e2ff4c490693b460363acafd992050bc345942e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.23890784982935154, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012461071376316616}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2790102389078498, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013106784883601346}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", 
"use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ece21df5abd56da9c81ba57d590bc976d3db6568 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_challenge_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.23720136518771331, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012430399829260847}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.27474402730375425, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013044617212771227}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fe83055a6113f9398ac61098c783478705c84272 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.26262626262626265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00902986177676375}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.26262626262626265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00902986177676375}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..80d96a92fb06c912e8462434d63e24f2f16d0298 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24368686868686867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00880917174472056}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24368686868686867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00880917174472056}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ab8eaa8614e2fff244ef6f1860df2c4b0f7fb8a6 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2361111111111111, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00871448049171129}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2361111111111111, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": 
"ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00871448049171129}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0b75aee67cd2eb9241ac93c3ffcac5b11fe16930 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24452861952861954, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008819461106822605}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24452861952861954, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008819461106822605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1d938a4b3b4ea0f6110c8b5cffc9e239c53f403f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to 
solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008844984581934896}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008844984581934896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..595a2c96effdb252d68e80f86ea6e985e2c51c14 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2521043771043771, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008910024163218198}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2521043771043771, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008910024163218198}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..d54ebbc5b3c92a2dd755bc3cdebb56642e9c975e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.359006734006734, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009843424713072178}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.30176767676767674, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009418994158522521}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7c7c80946dbfc3a450271f86e755e1cd05b11347 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.32575757575757575, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009616642976885977}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29924242424242425, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009396447162309824}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1f461037764a6c8ed39949be22d7396f936e537c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3181818181818182, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009557408782506376}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29335016835016836, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009342508331708558}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ce9b39ba92d28900a6ab56fa288a530313e79ef7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3194444444444444, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009567482017268088}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2958754208754209, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00936585413414006}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..074f5a994bc1f3c3bc74e191cf7ee69b0f638fb2 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.31734006734006737, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009550648343947768}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2836700336700337, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009249781691140744}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2be2bb10dc69eeb83ab1e4a5a48e40db38007cc8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.30934343434343436, "fixed_answer_choice_list": null, 
"dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009484615220606831}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29124579124579125, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009322788837938863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3dd334a6bacefe203a8767eb9fc07c47b1e29925 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2786195286195286, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009199329195026362}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.27104377104377103, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0091209197417606}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, 
"limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..06da71a0e66ccbdd3da8c8857422e823c5f35f93 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2727272727272727, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009138630726364233}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2760942760942761, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00917355987383526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b9953f2fd74852d4232151e143f1fe157bf7e933 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2777777777777778, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009190779909649912}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.28619528619528617, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- 
{{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009274470774627728}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..be0914d1324a2b505032bb2d60f65db8d40b233f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.27146464646464646, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009125362970360623}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2765151515151515, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00917788010146828}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1600fb140e108d1c3658ff6880053b327c22b8b8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.27525252525252525, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among 
these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009164888895174743}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2760942760942761, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00917355987383526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2604ed26a62fd4d3d1c0f92de42db917ebadeb43 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2676767676767677, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009085000147099363}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2727272727272727, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00913863072636423}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_0.json 
b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2af5483735cffcc1a5f44045e1b6e524ff2863ff --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.26052188552188554, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00900643589033659}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.26052188552188554, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00900643589033659}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7177aa5266439d66d48828023c398625fc7e6e9c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2441077441077441, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008814322157999387}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2441077441077441, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008814322157999387}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5188342e55dec4f6fdc61b19eac7ce320f0c822e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24326599326599327, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008804009846865538}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24326599326599327, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008804009846865538}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c8ba6b9894988d8da1ac0057a38f5d355d2e12be --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0088449845819349}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 
0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0088449845819349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1d8568854a84691bb64b3819e0384287716ec6ac --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008885233166386385}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008885233166386385}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5b0978f3fd822ca53410d5e1805c38f729b8cde1 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25084175084175087, "fixed_answer_choice_list": null, 
"dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008895183010487386}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25084175084175087, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008895183010487386}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fa0f7b1c205d38480aa9a1c1437242b1bcbbaef7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3367003367003367, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009697166595752467}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2958754208754209, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009365854134140057}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c280d0d3444554a916700d265aeaf56f05f3177e --- /dev/null 
+++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3181818181818182, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009557408782506376}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3005050505050505, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009407763090599316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..435adfd5860386696a2ca1a45ce29548fd3a9f74 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.30723905723905726, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009466688832475378}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29503367003367004, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009358110551087425}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..7f1a8d123eb7720ccaa3b38fe5efba24558660ec --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3106060606060606, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009495260551195607}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2962962962962963, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0093697115856843}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..02536a431eca3db3c4db84d899c96a4de6c4e279 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.30092592592592593, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00941151619378719}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2925084175084175, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009334649503078416}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_5.json 
b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d307299a2cab5f567d35a4eb82e6f82d87b83728 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_arc_easy_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2975589225589226, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009381226721815537}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2878787878787879, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009290733161670159}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eadf7a069d298cea5bec4499fb2335d1b0b78588 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5886666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008985524690229497}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6213333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008857326053368308}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5dec125c66dec091d772638c5955b02bb212f93 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5873333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008989877766895466}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6216666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008855801251873015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0a1097b7de8db5197b0540f68355a38cbdd10697 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.564, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009055127374988179}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6013333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008940758594209426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end 
of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..13661670380951d06933e81e77ae8cf2b789089d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5533333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009078141663938732}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5896666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008982215188519148}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f4f91154b8fa9032d83305e60db5d604d5a9f10b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.551, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009082611478924389}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5806666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009010624844204292}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ 
No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c6531b7843ce917847112ff58ece13ae4a1f47 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_GPT-3-Style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.546, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00909150987738651}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5736666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009030591966818144}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9509ca667b37d92709e2eef9740b34645ebf4739 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.621, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008858846410222197}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.4493333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009083233528874787}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f3fa8755c2e8cf4f27129e727937a3f742448326 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.568, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00904540065950836}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5613333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009061278956794627}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..41996b38e0a3805c257ad2eaeeb6eacc3aa24653 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6133333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008892593055774285}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.6106666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00890378508047089}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..046f2b9dccf318a9b1d64de57c96963bd4cfff8a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6183333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008870849530787626}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.6173333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008875277637761272}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a9eb5e908d5baf815ff5bf425bcad262bc8795e7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6216666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008855801251873015}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.6216666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008855801251873015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4f836ebd0991ec50e3152d19df591b0bebeebf63 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_after_reading_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6226666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008851200156534388}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.6226666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008851200156534388}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1ccbce3a46655f30e76f4ce9689ef20d8a52869c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6236666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008846558976258922}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.593, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008970906255948518}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5228956cc88a5697d3f5bf9b8f54e1b049110507 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.573, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009032396953831089}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5613333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009061278956794623}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d487efd95ef1416ae68df61a6644f3798e6907e0 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5823333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009005596833757835}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5753333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009026006087500427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f5c403da209aa01186e70abc16208a7ba8e30a79 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5876666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008988795877959723}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5773333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00902036441484364}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..529cf1320b0b68292c0e8f774895e444ce88841a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5776666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00901940941590418}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5713333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009036836097555087}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..15c1f6dbabd2d6e09e84cba7961d843170e8fb60 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_exercise_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5713333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009036836097555085}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.567, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00904788859878573}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dde967334dcd1c09322ceca079a02c09caa179f4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.605, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008926639623340282}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.44733333333333336, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009079439381402944}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cca469a65ac920c6a80d6e079b730a9f12335ebd --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.613, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008894007408882734}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.6126666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008895417372116205}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a74ffe2c10fb19e90d9ff73b39d8702014c1941a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6116666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008899620943397689}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.609, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? 
True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00891063782727302}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..779ae5ee71a15717a70c52fd7e44ec2c01df0736 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6143333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008888323636208591}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.6096666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00890790983863795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5a45a1fc072c0c36ca68836cc6c5730a07f88d57 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6146666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008886891702571046}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.612, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008898224137298402}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bf2a7b4f2ee30b50f35bb5e8233529a88cc1ba8e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_valid_binary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6166666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008878207616769261}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.6146666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008886891702571046}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d386a4f0832ee672022f7bd7cea7a4f1e4dd114d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.596, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008960362494453696}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6236666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008846558976258922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5ebfe4970fe90124be8dbb4fb28e8e90839f769 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5403333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00910047692710895}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5416666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00909847370190195}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ae97b9f0c25638ed515e2e4b9212b02e166a75bc --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.585, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008997332048705705}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5883333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008986619341172333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ec82c75af73458926aab1adbbc435e9cc838d5d9 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.604, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008930542249025198}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6076666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008916041436343385}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d71d5c587a27be755cf3af7495cdbbb7d76a882f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.6026666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008935685051576499}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6103333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008905164372580982}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b3b98badc7d1b154b1bf02e98a41247b359f6cf8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_boolq_yes_no_question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.6033333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008933122315228997}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6116666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008899620943397689}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1dea41bfbf94b6c4872dd69909190c9e5cd61f51 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.1940928270042194, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..be998610a3c2dfa996ac2cb3ebebdcc2b5156993 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06460957383809221}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.21183261183261184, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0950afb25d08b2f3785550664838ecf74cfc0a20 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.21400304414003044, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0886abdd48abafef869e764e4a7023c079c3b92b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.22946009389671362, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..17d82420dd041fb0b450a488b9f51aa2abfbd616 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.230804179918219, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5bfad3dfef4cfe1eb854ae71e9fb50d3878e5cde --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.21400304414003044, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c86b76deea37d4ceb24ee61a34c3a910114ba9f7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.1940928270042194, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3a8a48d4fbdcc5fcba7457572382e5d171573226 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ad194c52545f48459cc2486b8a733e496830c8df --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.48214285714285715, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0673769750864465}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.34, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f18bf071e4ddfc33b4e6037077e12ba14f070687 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.5357142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.37777777777777777, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cda1efbce4eb496bcbd768ca472c757d6317ebbd --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.5357142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.36377708978328177, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9de0beadf8e3ee80c789652f44a562f29e24a591 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.5535714285714286, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942395}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.35643298415256514, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aeca7edf6c8e5da4cf87823e20153982692ecc4f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.5357142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3764875586007934, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5de612509780826d83137bbc1c61742adaf556b7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..40747864f849566241ac5f76f405631b8f983117 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0672477765493766}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3466666666666667, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..531741280479f8bffeed5f6c9182742d915eb56e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.5, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3645231677576691, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fac6a032d2d57339c003692d43c1ed6205559783 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.5, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3587301587301588, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1a0e7b16d1b972d3dd9033fb8788f6450c450192 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.5357142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.38165374677002584, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3b956a63c7971dd3005794ab91687977d4a00f97 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.23214285714285715, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.056929390240001085}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.1873873873873874, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4b3f7ceea5d0f6f0885ef6a525b4f9138f6760d6 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.17857142857142858, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05164277182008721}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.18229665071770332, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..05875689f6e1219ab63a5b70bc683970149d2b92 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.26785714285714285, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05971290310957635}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.24590576971529352, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b37164e3430ef51b8a60d9795eebd2ce78be9f4e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.23214285714285715, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0569293902400011}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.20256503424980152, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..060addc935f51ffdaae700ab473f8d25d3998485 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.2857142857142857, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06091449038731725}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.2244393241167435, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8ff711f568ad5e67057f71c9c36e12f6b3e38630 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.26785714285714285, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05971290310957635}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.17522768670309652, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5dacc44905f5ff9081aaba9f9271e5a3ba9e1874 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.625, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.44338000491279783, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..69c3154134cdfbc4c06c9d1f2c3008e6db1eef98 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..40676f5c8c255684c76c239f595769b053672919 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.48214285714285715, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0673769750864465}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3494339622641509, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8d127a8964cddafb892ba915a790acdd6f0db587 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.5, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3538011695906433, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f06e53b74edc6e3ac3a9d7d47dc9279598324d8f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.5357142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.38467432950191566, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..90c61fd6b9478a4d5ca04e03baaab6ef0be993c3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_cb_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.5178571428571429, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06737697508644647}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3583333333333334, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a9933821aa71ffeb91c52ba36449bdff53c49838 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.55, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b8bb1a592dad2e9b9a6a3193f92db8f2e5cf528f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.53, "fixed_answer_choice_list": 
null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..781b3981e2f609c9daa46fdc630f2d605de526ca --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050251890762960605}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..604429d1c49dbf4f13ad6d0b85a03828520b2f34 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- 
{{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..63b659983c96ccf2d4f47863c7866b27589d30a3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bae3724803d9955436a4cb7aa526e840272a0d54 --- /dev/null +++ 
b/4b284b12boscar/eval/agg.4b284b12boscar_copa_best_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050161355804659205}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ccb7ca2b1a062d3198d677aa72d39281b9b21e00 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.64, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04824181513244218}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956911}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, 
"seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9a3f65bedde213f814d9f3f38d0592d505d71ca1 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956911}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..97b2010bd6f7601bb6d73482a90a47cde4d8e513 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050161355804659205}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9e5e61d093a032bf3074e9b70dc16196fcd8b256 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050251890762960605}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..88af758f74af31177649ea1382adc22f3a55ca71 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050251890762960605}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" 
%} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956911}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4ee54c259550c5cf96186bcaaa132d2b687b0067 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_cause_effect_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050251890762960605}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..44c6a05353cd44a4a8d02e0dcd06dfc13346af27 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.68, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.046882617226215034}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.58, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04960449637488584}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..64ee0102e76af5841a0f3ce9fb9b15c125d74c81 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b31fe7d590dffe0e189431beb81b871cae50c976 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050161355804659205}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b85617c8cdd76d99669fece42151c7279e1907b0 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..643823818fdcbca822c10804d48023c08ccdcc1e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1e2d6b0b6d864447317650cd1df99f971d496715 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_choose_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..26f3e834aa9f774aa1bb861ca218bd9f700a9361 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.58, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04960449637488583}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7b4410a96e9e3d635aeddc7579915fcf969015e6 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956911}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2aeffa304cf07eb964225456a2c2fa8e33463c00 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049999999999999996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0ee3b79340ebf053c5c079d32b091f835b28ca8d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049999999999999996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..567061894f109c70fdd50064c484e99e77ff4781 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..27a5c02b042637fc4a262f49cf7d6d840dfb8ef8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..82777609f17664ed91d90f29af705c066e42379a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.6, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cbcc08c06f5427c5e21821a05ac4c7934e5eb535 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620333}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8dd8ef28a64c8769d88b4e8c45503db5863898e1 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d82e18fb9af71ed2dafa4b52876018f12df2e4a7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bdecf71e8bf0014b93710e097684bc9e6de8a763 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956913}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cfb00631c88ec434bd64d433d890306b59e80004 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_copa_plausible_alternatives_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b74d91f74718e57bc3dc31f99637a7dfc8a9f7fb --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 2.893181093567166, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.050771258383273614}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.22721101652903636, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002174631786151447}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.37723322238831786, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029605308604336612}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.273629646761964, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00219562740727204}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.07640865030189278, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": 
true, "comment": "", "rouge2_precision_stderr": 0.0012649024640329508}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.1250175906265568, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019464934211797466}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.09100215554293947, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014003878631314351}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.18768076986646293, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015662913202835055}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.31630912991285143, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023181337659267383}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.22738917572581716, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015849122049742528}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.1945842233274229, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018735455393389011}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.32636244393347086, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027067207730447885}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.2352022676875879, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019310924562291735}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fe8ada879b280f20f9ddbcf562a50d54e1665ada --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 9.399402743953457, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17924144077003742}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.47386061055231526, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004267753730229592}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.38119947510045676, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003610048652083222}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4002878107328038, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003311711914052721}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"coherent_text", "rouge2_precision": 0.21711896294585206, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002796447264323068}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.17110129874947536, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00224417594257099}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.18025992915993652, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002178035933267876}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.34515720589173515, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003487240459637474}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.27473181051861545, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028013542645951507}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.28934035658943014, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002625512046929914}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.38735824622409254, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0038432739893741875}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"coherent_text", "rougeLsum_recall": 0.30946758998472124, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003145910771718272}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.32576004187947605, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029528457213781733}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d4149ccb37291c493fa9764b9ec7604c3cb634aa --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 12.155518015461812, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10084300095584796}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5512470663299613, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003380008431651804}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.44400338391624267, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029538365913921783}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4675919630941539, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002380211778611894}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.26274066794234024, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026741792596663163}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.20911393627659425, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002213438817982695}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.22010440502587322, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020527201506163662}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4042733522247706, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003014974305975856}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.3236197121255908, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002493240331238564}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.34129076382732576, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021490279108278207}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.45311242528524515, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003250337053382386}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.3632843159031807, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002717180891099093}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.38322006389684193, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023453120078189096}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..35bc8773a72378a20e78ab4dbc42c9d1760cf905 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 12.818480424555865, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11880096488769476}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5716424958707295, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003245966702252894}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4533042534124351, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information 
given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028426294428620666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.48141444524720783, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022227475636040234}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.2761805127749399, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002656621555462259}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.21740253682265098, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022432953731971084}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.2302982218465413, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00206470098719534}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.41989832427018686, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029524344642488424}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.33057724250995923, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00239153458363529}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.35184243194966225, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020609113731491014}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.47142486480907, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031553751764853}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.3723049045197885, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002645921584578156}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.39598482328679385, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002247763634503275}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1c8b94bf8b0ffabfd498e9b6b1e1d82cf80cff1d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 13.023419310896914, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18787640339568382}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5811755264625634, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 
0.003279952136584487}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4531163332976295, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002808885692946532}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.48507558149893437, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022388986956659146}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.2840273053174427, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027473207641567276}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.21849814391644223, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022164052552589037}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.23379683139470345, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020763058969761714}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4270926156453398, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003003901373753685}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.33141755050914024, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 
0.002423830245139234}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.355063998866477, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021030562380576637}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.4795421964847328, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003223552047844655}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.37256874707266874, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00264137203801963}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.39927685598165713, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022893014461836786}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e2a300f5a1ee5b48ea1aff618d773e580ff66930 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 13.13874169567919, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15764383116806005}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5894664795975357, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032849183519925037}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4516858553529017, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002776001645788723}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4886001619644043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00224088818064398}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.2911074453008736, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002754346176578065}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.22039619320267384, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022225995765214612}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.23823194648216817, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020968310082903024}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.43258691970018376, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030113124612643483}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.3295502069878702, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023782291443152845}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.35702681875777975, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021061422127890642}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.4859598571636165, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003236922862238485}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.370994090640816, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026190329216342714}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.401781353582733, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022967926955326317}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..552692ea4f1512361b4c4b667bac9043a6a0b9e7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 3.0806537976803305, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08654959981418281}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.24923430615693465, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002552336287648743}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.4156926007465285, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034726402924926524}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.3040771492183391, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027859763603378437}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.09047270537738747, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012069986829003393}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.1527487489689877, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0020328633725306884}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.11078262934301955, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014268073506531276}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.1751861657827373, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015166532602682921}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3033207669558517, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023528098672245283}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.21656571042527803, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017004253286561153}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.1972457019261279, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00203359756864001}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.330657714477746, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002879107948785049}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.24105076727059485, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002242976032324967}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..758806d2c6162c9391b08ade97f5b5875f2901e3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 9.677480998476096, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11714095702469358}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.506140960041348, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003945234017119319}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.3944081946120404, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003355916640555891}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.41881888025365843, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029474810209025957}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.23089703057281993, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002767397153643465}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.17599343128586165, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002162757026079701}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.18735030535151137, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the 
following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002076624224100413}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.36929998823109533, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003311934706749638}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.28430568800970424, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002617795785800394}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.30292303705529133, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002380196460573085}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.4139933304264028, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036069187091296155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.3206920362797161, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002946183435512405}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.34120550977343506, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026792347942173355}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 
1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..56444979dd676bc2535c917d1fe81d9cb44669f1 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 12.05612102268846, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11261168026208578}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.5563180574898297, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033097983809663666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.4433284363370441, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002894846309829138}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.4688987306368925, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023019227203330225}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.26400701514420555, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026576362464051425}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.207135141008292, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002157896147095455}, 
{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.21917697225253102, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002001502790757259}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.4084452661546365, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029981863130726506}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.322698679438782, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024106169595112235}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.342134715446231, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002081984836426825}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.45805476385316257, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032136214215560794}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.36275138594783457, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026398678390256143}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3845332666730455, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022694145068922577}], 
"config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7b53ba9b0d6104e6970d5d7721515fbea90cce2c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 12.809308859394257, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12012089885991}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.567197996394036, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031941635563547547}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.45496464714534934, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028587538099377643}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.48064245798959665, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022104878204895783}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.2713808301404379, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026621379369616456}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.21540061776410618, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022399829182060465}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.2271609238297581, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002049696551360146}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.4154154613540611, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002920856843326935}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.330654493573185, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023883139342088418}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.34999974120577987, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002026696417621307}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.46722317705643734, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031218760227686975}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.3735607439910476, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026634600896822715}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.39498527306923364, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022345222837571576}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5c598465d42e81a8708bd2bc02cf5f57b5cfd62c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 13.080220013614857, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11160584714298007}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.568344864604142, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032244220520078275}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.45346556859050147, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028165310549028376}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.4809596449905771, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022389806016051804}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.2747057715723681, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How 
would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026868813527321516}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2165017740347286, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022132654915194188}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.22940323349641095, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020525835595875536}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.41712827628191324, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029432229582722754}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.33039201411660385, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023910452617168718}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.3510224338515258, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002061496441416912}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.46944298374040827, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031624401569722853}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.3729547665117702, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we 
create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026487677764114458}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.39611863123059127, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022799004681333724}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a0a4a560bc81da1f69678ae21f7e12fd1263a7e4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 13.126754982894552, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10360087872116377}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.5800244423214129, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032430845106271576}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.4522814700737286, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027480362924234376}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.4854500633563135, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, 
"comment": "", "rouge1_fmeasure_stderr": 0.002219496097131078}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.2817904995507888, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002690162257773001}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2173596999234223, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002187185451199481}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.23322234263569483, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020699168576186902}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.4262225377338777, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029779169967996694}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.33076771892860724, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023724569222499923}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.35536850445962775, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020947916034111365}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.4805351741121597, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.0031782540412465792}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.37371143448503646, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025800203198591233}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.4015046168561641, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022668461919812784}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2a43249eb33934ddc1082a0c000a8b622771e127 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 0.09272307710118581, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.015507299531691698}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.06857149404466026, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002714756076276083}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.03009537274390525, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0009494627429628092}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.03728897813365394, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0011576994918079029}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.03172208574045067, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016960895743799115}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.012438566317548796, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004694763784147565}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.015494426267291282, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set 
value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005915200199122902}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.06643255860316054, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0026515653043268214}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.028870230253503604, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008942016978715927}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.035838656895578976, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001095355689892508}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.06575040740426258, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026720246479516457}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.028116552151616833, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008796291436374449}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.03502489652281287, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0010881026456914845}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..865885e3cf869294f50023167d2009b9363f3cf3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 5.652249507377061, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14061164623877323}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.31222666399245746, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = 
feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0038409642516790284}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.2665121107894025, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0035125435383699653}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.27298366766974236, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003251481554695709}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.1333588804023989, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0023201775029695754}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.11363089991243877, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001998959577037935}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.11595469608797514, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019122129455938347}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.24182600036670712, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002834935214515838}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.2036428358852747, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025389192347628087}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.2097551265064886, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002334085005475068}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.26033955477578685, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.003317352229551968}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.22088539882783978, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002969599084643113}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.22681833491189718, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027772963952946224}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..335c2bd8837a93d1d2d6cd880bc7c5294925c4ca --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 8.858168867127539, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15358413684529032}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.38944127465466044, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004503183294613872}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.34014174486376614, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004015461323957741}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.3444247303322821, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037208321217199283}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.1807977835141012, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002801765538636182}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.1572154217818733, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 
0.0024179638680786484}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.15850574855645186, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022819154552772038}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.28782843422256305, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003320592623914537}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.2498454599165124, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0029373372282745012}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.25349129697286293, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0026956970102644516}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.31914692411759643, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = 
feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0038546708339040276}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.2783358271015542, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034103422656029617}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.2819365226348762, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0031783863755039652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..83c112febecda057723322d03e0408cd10370c31 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 10.35853514789337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12387147878155497}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.44248133468645384, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004497810429174453}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.37990230663646296, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003927199237573467}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.38990963042752075, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036858837285380693}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.21053296232652152, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002881133238894378}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.17980536378141043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set 
value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024724275105467686}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.18411460451025186, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023707420345364556}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.3187308429757642, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003373786617903847}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.2728727940909337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0029399293650148287}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.2800852266792753, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002743206810693273}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.36059219853066166, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003917787632492712}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.3090446652384135, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003388294524398576}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.3173742799146112, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0032163177711543606}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d08b3c73e4a4cd5f33d1a3bfa44f597744c9838c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 11.515556371710902, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = 
feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1521281614192603}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.47723640823076796, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004287267359218536}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.40660753931112564, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0037138867542392297}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.4195774576048078, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034528003933074034}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.2291073642884891, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028107927329324915}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.19454563878113562, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024341427164360544}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.20036043048947655, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002329405952733126}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.34087412726024163, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032590582635772347}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.28966452741743237, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027984443113154285}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.29891705821656117, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 
0.0026005964258569996}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.38895612444514427, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003776148236077503}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.330648039857798, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032240413111297886}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.3415819675283937, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0030544859332802426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1b9699e42891e27c943fa0157e12c5d676302220 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 11.962882852349749, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12492026969156166}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.49857309965795027, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004095692107512742}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.4195162433312554, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0035246726867291895}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.4373024691627277, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003268691629876463}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.24035982989846097, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 
0.0027975432302982687}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.20137381546159938, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024063007560350337}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.20952873474512598, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002313066132505112}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.3521991129833718, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003123615311072105}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.2963793856785372, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00270044093552581}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.3086640751036808, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = 
feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002504229241203423}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.40439342888738805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003612123346774531}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.3399434796164529, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030955619182597956}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.3544764119864059, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029181394624907212}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cf0a3c6a7d07a93458deb051af8c6550aea76a13 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 3.205452750701432, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.029369723752371055}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.18794203900806028, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002509182484953893}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.27563499652889517, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002522516313059265}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.2026666683966741, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001854044988549804}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.04736777911623166, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009912253924337147}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.07529125273517918, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015129136922472945}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.054594037237788585, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010529893200416562}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.16091347621914998, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020468972719200795}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.24052353002182375, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002110543832931725}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.17465641737008017, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014217853180858962}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.162408167198493, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022254242193644625}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.23809586470877792, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024056082119999767}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.1754467264102351, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001775758147384932}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9320374bdf3fb15ef28bc02808414f7a37cd021f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 11.314986164630461, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09886007247543747}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5405367881450536, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031944831139177473}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4265799747472657, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030051178365182673}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.45168976701280994, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023803410292144262}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2530645509899577, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025820188553003945}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.19690559018039472, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021546256249304957}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2086255432248638, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002005283814444169}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.3941062713485247, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028671074248121063}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.307687838606323, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024236181103855607}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.32671702545878634, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020575751841782584}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.44239922810973775, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031220549443630044}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3472525108890491, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00274280048405182}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3682903569907333, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023311514221899116}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3374239d5df0171b1a9d73b5c4542b4e1edbfa92 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 13.825382196007423, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14341892864265515}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5584732398385521, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031726923154704037}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.45947280161409504, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002947956873468934}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.47949895116223057, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002274496687632488}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.27533931731718275, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026239982120668894}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2249642538757707, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002262594422709972}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.234132554236615, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002050727643013208}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4144330916795227, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002865466880028744}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3391232820910882, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002487675496155266}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.35427830623229684, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020644801949430084}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.46586823562780794, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0030764860572737814}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.38321253954384926, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002786697010998289}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.39980527255607845, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002302036382207211}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6cb2b3e70f9ea04731c70f4579260202cd7ac4a9 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.734710333150248, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13327591335822025}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.566363810944366, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031329176207456266}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4738543349642313, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002916750104399688}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.49284383237521995, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022741384691834527}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2835262882274797, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002620037892529304}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.23582515668443185, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002307150627998116}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.24473795777586516, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021021762412204888}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4201683174467468, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028464798891116223}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3504188695034407, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025288933574772144}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.364532741901036, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002119688594264684}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4736089581434219, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00306065010387841}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.396405785185109, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002794473833800316}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4121332244451046, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023416760964820984}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d18bcdc54ad1615206cc9946574990adf91859fb --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 15.080804791535655, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14821971591156646}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5624928789321237, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031294366446319713}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.47592863926028084, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028908116130151665}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4935313098839454, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002271845302152271}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2834307304449863, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026187903085809257}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2386118604584549, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023078935639056814}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2467411297660491, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002101346812765226}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4178299399327378, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028182273866756095}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.35288616277467044, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002503800280122615}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.36587999086871936, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021050945906898252}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4726074434579816, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0030529008210380413}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.40030956626841174, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002788731746840601}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4149830902149149, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023565846719075163}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a05232a24ec38382d55586978dfaa494be98de01 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 15.265600470980436, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.20285527803005138}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5617758997558588, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031160645760995214}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.477780058424707, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028609800200870505}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.49535698221178465, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022639532726268564}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2848892339570606, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026387395366262536}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.24077912408729735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023046951728657333}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.24925086960246148, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002125288231383812}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4192693573282302, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002855212675860218}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.35537258514643544, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025017841926330076}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3687908505277805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021558916218128877}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4743649010098781, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003080316334291657}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.40318795066110624, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027745110069664625}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4181866743293669, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002368198674279566}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..268d5cf7c284140065964611fc77d3cfc78dcb6f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 4.571492146276437, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06410871617063811}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.1908676146970543, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001194006157470212}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.4356617762292338, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0020338819023735448}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.2605612470084477, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0013649714000006274}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.08067191193687241, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007328285267680664}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.19180765723962376, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00171245316056684}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.11129123255973318, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009537147017480764}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.15773820339990663, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009130906909507374}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.36537444743215186, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001890266179683196}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.21632110836430327, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001087079156733761}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.16951329717089128, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011272646867077703}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.3881088934505719, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0020860272608716683}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.2315858520913805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001327505047025221}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
"task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..04645af7529dee042a89bb1a89b8b4cdaedc8969 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 10.539837668585216, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17465438135688363}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.5480364269639086, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003312334506418998}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.40745113744298617, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002799036288361961}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.44123977635419337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002211756735853885}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.2555460664426593, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002740757842598831}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.18529601706006868, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0020284497579205283}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.20149328522490387, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| 
{{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019532239658465902}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.40745757408421523, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003071919070780174}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.29894440995242827, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023043029752528896}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3250117513928856, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019969004221661773}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.4504408531473228, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003267241907381879}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.332318789231438, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025568322527799218}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3607660614467818, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002203780125553279}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..00789ab060fa3e0c0a0f767dd9fb2709bec4d751 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 11.979737123111207, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1777216675487079}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.5694774840044784, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032690447343626645}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.43169795189062204, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027755841019906488}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.46621835136055, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021811745216102235}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.27578961505709343, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027563670362028268}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.20476632102377532, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00211282040156597}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.22144793601758048, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019896499062567576}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.4243204735629267, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003038279741971009}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3183922809822098, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023264664598331725}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3449621710063106, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002021110115853051}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.4705230491379006, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032090612787953627}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.3543590674460134, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002547140809474543}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3836388316327653, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002192238878763976}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..56b1acea0d667aa786d9415e8770420b7c99377d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 12.462479849121463, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09061529648631804}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.58220595007106, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00329263853652951}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.4397725602332855, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002774975421382418}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.47693684077807014, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002210569646771097}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.2839931545986718, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00274726172380332}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.21123062905099813, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021777700695380237}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.22927798524401358, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002062255539404591}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.43127648354778575, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030397840353640227}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3232046086079628, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023417332544385812}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.35144747175107627, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002070073239316246}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.4789262682132351, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032006625726612964}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.3608549641566686, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025963317856745423}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.39176024632309325, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022580636053722825}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..873eea9a64bdfac9e92f2443c2c543d94dba696b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 12.721474775009176, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1231471401393544}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.5882829470459832, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032579810236422493}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.43920151452158274, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002766526893491765}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.47944687065511643, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002226356134768176}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.2905552860238982, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028062355496161736}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.21355809012964339, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021815257564468844}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.23330038384366422, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020867339523630442}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.43922338563825464, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003062166327090343}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.32586336591450926, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023806352520646}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.35638075600135893, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002117189037709182}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.4865694268823983, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003236938827400275}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.36252025368066076, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026170321708128956}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.39603605414957316, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023038968339561114}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..31482d78c318b53fb60b7d6369a844e9e903f84d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_e2e_nlg_cleaned_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 12.578909654685171, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08947366290256598}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.5955748121041431, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032883465766855922}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.43546535746599335, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002731334226696082}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4799825375133372, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022136759502543388}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.2963639637421549, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028409667830761806}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.21277263188099743, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021803230244831907}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.2349236523331446, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021027603419611013}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.44379029665218356, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031157354991706576}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.323130840043402, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002427154840974425}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.35648107973364707, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021733704043705015}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.4932209061993962, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032875700607312756}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.36003608717718644, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002619980949391493}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.39699261454485735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002322759270632993}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2c52625fbf665e8baa09fadc2348d09db0466c66 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.09615296353924062, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015634664664492218}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2420926320977713, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003742160422644687}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.13605098361826837, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002138340789759129}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", 
"rouge2_precision": 0.014329381922408752, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006598814426382717}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.0376524650639389, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017573473002010546}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.020520962151658592, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009410783718450363}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.07944997329667514, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011619662946455182}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.20129914286300007, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002862368913468282}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11259477615346435, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015919918693647467}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.07747255863790849, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012436400498643209}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.1971312521833301, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031341971345006906}, 
{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.10992024258016028, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017233150363981779}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.7154102266273981, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07442684379362219}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6217c7e804363888cc1e31f3dd0488dc872b3bb6 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.10727843731122125, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016429787642985747}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.22812552760950588, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0037592807959132097}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.1401897846455214, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002065563130684558}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.009366381220805652, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": 
true, "comment": "", "rouge2_precision_stderr": 0.000580611530069708}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.02227051513823528, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014598457069566306}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.012848193733260807, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007993459523072702}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.07737127160035749, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011488670401891248}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.16356799430911556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002554947845779246}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.10061411067602495, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013753187776155962}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.08597435637614392, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001284877552656849}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.18438119439843625, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030689265824266213}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.11263425306203224, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis 
boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016337405096701044}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.5591102576081973, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04726830548473858}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7b95bb09cc9101766fde0c937d60781a9b54f4dd --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.1353701115549322, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002553526241523374}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.22433509698003423, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004121136445432196}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.15686903837773863, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025131912347000135}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.01735730166703164, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011030459333331683}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.030592710991937784, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016233577292839108}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.020439860337785443, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011077341156553913}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.10015629558085037, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018598757797987388}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.16404884592949798, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002797494501855951}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11527800425354318, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017225562433088898}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.10553654669038727, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019001720096507816}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.17730755997688144, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003284544255799627}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.12292764498886767, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001920929967654257}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.9148312975010592, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03327013226559874}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9e308304043a3645d5dd7cadcc2dec485e6475a7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.14823607772738917, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003044719039917616}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.22710294289015492, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004565515334844266}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.16475864035896656, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029505584419723704}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.024478360849588366, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012845736132388852}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.04089886902879985, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00199146152038313}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", 
"rouge2_fmeasure": 0.028016924033823828, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001342276519470403}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.1122664526417856, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0022829408313811425}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.17187055503947934, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034382777440656468}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.12440016864393556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021747021220266897}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.11602521107341782, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023229102349895366}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.1811469470518433, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038206377447515404}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.12969554350306625, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002319691975532689}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.2528979414068826, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08376234817364027}], "config": {"model": 
"hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5d1bbf4480f32b4d7e88be17fee85cce2c633598 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.04597223962710385, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029601721330201006}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.059262066595310824, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0038174584010459106}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.04591039911863491, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002791474184795997}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.007607457998339093, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008488904393593581}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.011485459027641155, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011814997676546965}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.008239456967253532, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": 
true, "comment": "", "rouge2_fmeasure_stderr": 0.0008266852843405412}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.03533300507198341, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023173210243399443}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.04477820142586464, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028720813204613613}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.03480791338007262, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021042096800218883}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.03659752478918535, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023802274165057088}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.04741613084877418, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030892963972075855}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.03643953371509485, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022107928188988784}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.2598593623766219, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05321220151546882}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
"task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3840e80114ffbbb55cf9729e814e96a32159b9f6 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.0017152658662092624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012123554660875527}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 6.855686455852068e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 4.891129215213428e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.00013175230566534916, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 9.393304330315343e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.0017152658662092624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down 
to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012123554660875527}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 6.855686455852068e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 4.891129215213428e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.00013175230566534916, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 9.393304330315343e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.0017152658662092624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012123554660875527}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 6.855686455852068e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 4.891129215213428e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.00013175230566534916, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 9.393304330315343e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..5e9047f8b95c4799084b72583d2011613628adee --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.14392087254096253, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00191218773257099}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.3335510125155157, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00424066588589097}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.19753122388376124, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024387420146840884}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.03293453791040819, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010849848050450634}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.0801087967883217, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002707529002010886}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.04574277927115908, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014867843734761154}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.1092495491611276, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001447320489270279}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.25406306423955183, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003311772200910392}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.15005960824064515, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018551892590200018}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11488818638193536, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016338031980487882}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.26820478620797344, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003770590397028133}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.15789500500907694, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020981661003555896}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.7609629260492066, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07394573684560968}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2ccfbe8609b5fcd652353e77bdbd33dd4847dbfe --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.16788681333512273, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002996673841122846}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.2659066356647195, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004417630517149148}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.19054528836123152, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027762385529554825}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.03216596990559947, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014970450474531835}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.055429634265232036, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024126272098439643}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.037445432438935236, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015708375864807137}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.1268648821509307, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002343416430911939}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.2007501282338804, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034454252437585383}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.14365798797701282, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002152785247950343}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.13249025632053385, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002382334028787402}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.21378735588805484, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038631534751242757}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.15133625112518032, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022987189077342857}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.7208126835215467, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12882874863062854}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e504a2044cd18367d2049c53933d1d4d3364baa2 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.23190411064008373, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003928445720902683}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.27111252857924834, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0040695915462987675}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.2299166372281009, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0031451668265938455}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.054979050862230205, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0023263664060471064}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.06284480502767972, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002347005223009533}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 
0.05345238966079452, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020416410892504333}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.17596884203561058, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032896946978595246}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.20395858533104694, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032166129006930004}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.1733526719843541, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025885431117299053}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.17981583590210734, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003268601816031648}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.21287210159906927, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035404193682421043}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.1786191806974414, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002644651853081322}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.483152381487638, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12567252097455675}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at 
end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3b0602dc2707f7200e5e6c3f80c8c05f156d8f24 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.23800276566402645, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004262627386705336}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.2467737856414683, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004201225613127943}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.2263782642164461, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003565639023860395}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.054965936961357126, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002410881639820072}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.05596347742993157, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002284054310082863}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.05162608161498276, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021219089599664137}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.18015571246200018, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034901024101091738}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.18447874941589523, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032545586161932083}, {"task_name": 
"gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.17010227183963283, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028684700357903712}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.18347830366600554, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034874495221723506}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.19105755973960079, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035164481426434046}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.1743013875297768, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029210490230557468}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.6449487563680383, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13528609048586093}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8626d57b8fac1950f1222e79d57ffaaccb02ef36 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.06594749726663383, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004007317645309001}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.05829162428133866, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_recall_stderr": 0.003473909680439278}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.056708763963365785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032944667691835}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.014718326865253737, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016135099274249497}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.012961663415679227, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012919513912192295}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.012692331817431131, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012786487564727808}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.05107085537222431, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032136006098534547}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.044208562337105364, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002676338653334541}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.04319815386278401, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025538782731808645}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.05196861792963686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003240814781600669}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.04565880850659429, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rougeLsum_recall_stderr": 0.002808420561698131}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.0442062674611244, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026002268489095344}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 0.1609104692223714, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05303525426487922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..76504d7d13f2a7e7309b424996a3758a1fe19bb3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_DOC_tldr_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.002686128768259734, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007731754762039508}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.0019582859026840618, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005536575190801022}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.0022309537417988285, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006289284051050972}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0002260595490340218, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00011315061618655328}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.0001350850407454181, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": 
"{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 6.834253022123397e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.0001681232813308285, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 8.442906183093734e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.0015604855435599053, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000436200986701868}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.0011408291729870085, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0003164044579693306}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.0013014290599878409, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000360060633092522}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.0018928183051379502, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005458572053716986}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.0013677555797593326, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00038471438802225113}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.0015671971090940894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0004425697142281073}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.457981907406653e-43, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.397335680118716e-36}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e90086e0d7fccb46eaa58ec312e1954b0a4fdae9 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1544803647320562, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001998179323003808}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3617501123052072, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004401350843198141}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.2132476859870225, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002596728123368581}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.034104422429782094, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011296363295058075}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08341719921119035, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027753920629418876}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04756359555453488, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015388801074832958}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.11047768692113931, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014728181415206798}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.26038442217264174, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003368032246087583}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15260364524737308, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018980795169799797}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1235706334215309, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016790133111005653}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.29156249557144304, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003855753079955984}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.17085907541578757, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00219601338601483}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.7911957794729954, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1028992752886963}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..26411eb8b5b118b8458fe723a33afb3af9f75d2a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 
+1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.18210955050574165, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0030336588185256538}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.30765438008746904, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0045764351815007235}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.21212307020665455, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002805971174994527}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03608017605336464, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015351821810233588}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06530630731492977, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0026678483835833266}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04293890964191621, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016754133257735123}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.133576425244387, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0022712598584840454}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2257975651465562, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035054216942714362}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15522658921799362, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": 
"", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020874537466411373}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.14117554206112032, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023065404958774274}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.24343402256399355, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003941092857511189}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16565749532885793, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002253857246658309}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.9369867872208306, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13301743127426643}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..038eed7f2a88c7dd89e5b763042cc4327ccc1023 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.2105718627026518, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035778413560214016}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.2986830414520048, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_recall_stderr": 0.004329540379099283}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.2277383324826566, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030573712743888364}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.04843737420278684, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002078874124454135}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06777424154476913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002554380728108249}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.051547066687828734, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019622251118453704}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.15888578947684742, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002954896445415003}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.22275234069768626, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003313554078249958}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.17058950590239796, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00245256877334995}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.16407328548937136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0029472894981933152}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.23520189891677276, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037453526896800695}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.17778653214796106, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0025471887620010904}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.133141577656285, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12965820672946443}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fff2374cbf986ac05ffa9090c3585acda22e45c7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.21408207074509233, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003911771705487585}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.27616262040256995, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00445926401302726}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.22352993200471194, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034158533172611158}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.04872713712902665, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0021207481086453045}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06294658987847104, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002453274077945568}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0505319645754636, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019929881863291095}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.159438518303562, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003102061741719361}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.20607630574913904, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035265999447064503}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.16637492784414312, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027123155106531316}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.16373162683207845, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0030891132218923796}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2156279714811016, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003810201919065186}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.17215854927610266, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00276757297974389}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.2845962769932853, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1118763427251602}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b03fa5ea00ce0c5f81d233b965247de26b4e0a47 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06392907254960759, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003876201158464289}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.06750288591690386, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0039913615182408215}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05885821408809552, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0033400021306721392}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.014573840849595909, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016332201756935874}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.01510184445883284, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014126505769374265}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.012855134170914497, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: 
{{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011864715171664183}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.04895381713163094, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030882253843767114}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0501669133523796, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002985189039759907}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04407299444650032, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025320491349861813}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.05093801249895102, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031687581333617715}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.05318442494362772, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00319392235102553}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04628126164174725, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002645018363815525}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.3124946765538571, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07230250867579978}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1903d73cf102c0aee85ce0245b1a33e6c49acd62 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002395128003303927, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.000695243693352885}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.00195574424379908, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005505695407964535}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0020718557755462, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005803475030367974}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.00013792342491448747, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 7.95881099815707e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00011258133899643333, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 6.616712432800021e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0001226302404615478, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 7.106807371334149e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0016797182212470382, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00047877790197405523}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0013438433718934864, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00036757969239639373}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0014461673142424087, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0003992063154628714}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0017248567966735974, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0004849887493451295}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0014510474885315653, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0004115402839485218}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0015096956796575665, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00041391116095822775}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.0828965147492013e-39, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.1508906223493296e-34}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1803b20da03feb246c44f6d48cd2a4c76e270830 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.13956776165068546, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 
0.0019199393436886713}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.33339512043851244, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004403518880734348}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.19434908474166693, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002580674236992386}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.02954528480583196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010278446226912197}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.07384906277566924, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0026006210708488927}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04167140246132445, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014421807877901167}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.10296399819932696, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013531250614748109}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2481421743812119, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032958052192564938}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.14366448064057566, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018391132136240471}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11110482915168116, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", 
"prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001566443275567478}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2671737010930205, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003761177660698248}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.15494320648091148, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021279657138113806}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.607536775722765, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08314365108296334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6478d764ce6aaa23a5a54ae7a1d7d42ab10da13d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.1634034882580627, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002778000898524111}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.3297745025329579, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004476473426867562}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.2058050406751238, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026719707719642534}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 
0.034771289771502706, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014069026543471145}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.07187730165529026, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002600700120998646}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.043558889183295355, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015410306979479628}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.12029005613595435, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021151139292946894}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2433331878695519, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034049894620247537}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.15107763163746762, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019496226216399687}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.12853278724838335, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021998692763661926}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.26309292044402266, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003905125784954087}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.1625706073794353, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021908461537671815}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.7147680841837472, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08856857483686215}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cabe7e9c7f180a39658cfb17a940b5d82dc55d35 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.2109214359178399, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0036944383537942287}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.31362251846066164, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004396355711271353}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.22985666485690312, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029722882685021155}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.04995066744359591, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020522782013875124}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.07352603410863771, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0026284467349718616}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.0536753540466693, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019243134991576581}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.15911804861593265, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030530508121933645}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2349429958098137, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034108172816408387}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.17246176189956652, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002416319418728753}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.16524804859917733, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00301812800842873}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2498146184241616, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003847756166445163}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.18120098504252252, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002535120819891981}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 2.208731949110297, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12846314186145602}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..68eee212adaeedb95166d340355fca5744c02646 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.21988529529321785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00419402279091732}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.2943663747316836, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004684294497637764}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.22947123968727437, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003432595328640877}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.053900118661333415, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0022908140318405336}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.07108825092031434, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002629763702146578}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.055448723201269305, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002084868367528835}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.16335356522009134, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033163399393172656}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.21772234335500532, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036238704778221043}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.16982929076343528, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0026892298793044153}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.16944832007925265, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033011948856498242}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.23129127383235099, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004012907807026624}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.17807867601617003, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027835911872386137}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 2.3795755522326423, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09523488591207438}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4a84021ab2f450b371b73450a9f5f3548721a3b4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.06406893263545496, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003961269793753737}, {"task_name": "gem_xsum", 
"prompt_name": "summarize_DOC", "rouge1_recall": 0.06853782086108365, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004110519937921475}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.058377571953581675, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00330940736804228}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.014371718618784664, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015372576661987745}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.016136547005254933, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015536759212089171}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.013120867981094677, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012122900136758608}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.04844168198127235, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030911502808168654}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.050701248917136586, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030909279173779165}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.04335377999727481, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002489167957314197}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.05015315138408474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} 
Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003149051173293119}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.053513518304194346, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032937944931802}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.045303807045979784, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0025901263701828387}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 0.3681689900396952, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0877135242120339}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9633e28af28d173cb011b84fdd937a6715aff70b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_DOC_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.0022196819899664693, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006389555240065754}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.0017648362791656974, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0004854257314122628}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.0019298414574400779, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005424694841327019}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.00021367521367521368, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00010817206714968564}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0001435514171363228, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 7.2233251401255e-05}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.00017077154074693042, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 8.588075389535402e-05}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.0016999247726071723, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00048794679278765595}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.0013511577706827674, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0003704228627915442}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.0014733221159742111, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00041177963084147186}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.0016999247726071723, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00048794679278765595}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.0013511577706827674, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0003704228627915442}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.0014733221159742111, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00041177963084147186}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 2.3479106966936628e-40, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 7.024226621536986e-35}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9fe6063678dcc80682ec274d32de9dfc95d85ecf --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.15264482299030216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018892287776596762}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.3634914761237878, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0042673103293693}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.21237515311232552, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025066211781886727}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.03436173460976895, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010891793221324652}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.08578251655669292, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge2_recall_stderr": 0.0027573284566819968}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04839302460393313, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015221033585233316}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10959444729742314, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013289111469351577}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.26324151102462773, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032151197123248265}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.1527576047947492, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017849830465469345}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.12201884539644926, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015609231269051787}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.29254419732339304, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037093261325614685}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.17003724296721284, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002099393880724259}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.851658410571803, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0847044667439146}], 
"config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e8ee2357c4b9b8cd93661ee42f9c142f9ec4faee --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.15258459454398002, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0028523927205491026}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.27693243154947456, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004646942496742779}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.18283425417693044, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027713338212500423}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.027349319827138365, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014363650968716103}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.05286158605763855, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002448031923791781}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.0330637303888088, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014874261834413535}, {"task_name": 
"gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.11403787794008956, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002131574593844777}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.20704397122540477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035190916342608437}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.13624007426933515, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002017206261906556}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.12015729412445958, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021874344457937585}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.22214054465091698, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003980274533312435}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.14483517980421445, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002206068338853457}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.3833426601091705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07290009233361687}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6e941d4928006adcaeced7a1f66a8c02e68c9ebe --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.17538129287952656, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00314746642942028}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.3035380968702324, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004273062055225534}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.206890660644552, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028805684860764903}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.036920195648714564, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016798438935385148}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.06315746254159717, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024967120331334}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.042917379641906755, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001729879440637286}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.13204571333099904, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0025622160400850377}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", 
"rougeL_recall": 0.22767522500785403, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003313578716290039}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.1550000731348479, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002253269731731128}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.13759372211139387, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002589486006198034}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.24084923941771985, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003691447454191783}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.16271652676665874, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023840903049219004}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.7166760614351635, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07548868629100476}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ada4bd1dbac1dadad3f7c643a53ec9e02a8b0aa6 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 
0.1722314870590178, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035575987944855476}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.2846924373146273, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004686130007796534}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.19792978032299605, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003191223312545367}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.0383730367297948, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019206595568853901}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.06273993261989773, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002473625781893893}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04339127675794567, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001755983116522294}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.1296329568667989, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002873167572434554}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.21395972505616653, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003537281121092252}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.14829510892057157, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0024301211652535625}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.13501569230738897, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002923214874171359}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2264806298976215, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003998852680019232}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1557205369863269, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002608646494982156}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.8170891695141, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07365757852624}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ca89162ef5fc69cdc6cc443760117b8ae3ad36b8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.04719180405255273, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031100681012275313}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.06440377111919457, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003989373096614079}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.04912151385172474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003015141144280244}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.010781829086988182, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001247665158486873}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.014805196292695702, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014717021782107243}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.011144625832940378, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011272872776567269}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.03637888608887725, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002466167265372729}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.04889902154991311, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030204941398956834}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.03744254929445783, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002309310498043706}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.037736388154330454, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0025300867714614945}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.05122942299684587, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003191606648282039}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.03906436488072567, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024078009069653876}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.4440622247505757, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08944228394328604}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d8cb72f35c698d2a40813d10de7ad1d86e7dd25d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.004002287021154946, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013363256771009757}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.0005892791785351531, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00019776344757350516}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.000996772652660801, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00032675508253975067}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.003716409376786735, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012418823853125054}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.0005356771202161136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00017474324053640678}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.000906495501807682, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0002871332293725657}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.003716409376786735, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012418823853125054}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.0005356771202161136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00017474324053640678}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.000906495501807682, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0002871332293725657}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bcb89b72fd8df25036e0587ba58c810f4a5181e9 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 6.40989788407143, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.24713703635154027}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.0828887653019555, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0023077389471523784}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7500480520389616, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.005976873385040759}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.1383957433525216, "fixed_answer_choice_list": null, 
"dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0030655249622799433}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.06794065951065742, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.002233204911500119}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5926367717275536, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007639129170499543}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.11292457706725864, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.003079977103975518}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.08220854090668749, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.002300716890155535}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7449059048902814, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006047208011007572}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.13726205082911794, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected 
solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0030563810579125766}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.08093935926897239, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.002295609465381009}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7319720220794512, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.00622298612963512}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.13502746066634788, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.003057945370315767}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1c8f9e480a8f13b8abdc0c0e6903ebc4d704f2dd --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 65.74799325895958, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.0647696730988823}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7066231343001421, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} 
\nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006476416418191835}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6882071233310767, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006815625624428434}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6842447015845402, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0066922247839962664}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5611098444032797, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007818571899546059}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.552831719648591, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007981199503729296}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5495455894825296, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00787800597247414}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.691856821666126, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006672894747449168}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 
0.6764566471109283, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007029690572886852}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6721572966876234, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006912976226852015}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.6948970991963076, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.00663727200833744}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6783319092636114, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006985143468937649}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6741839052529897, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006866709595529206}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..99789e84500f036faad8fa1167326833b8125455 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"piqa", "prompt_name": "Correct the solution", "bleu": 67.66856848794465, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.0374101515399263}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7264533852083246, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006162751580297105}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7079928962389903, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0065376714002391}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.7063241591706693, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0063895506161702285}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.584378730022088, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0076034059725184405}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5746379343660041, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007784718789034897}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5731687972529929, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a 
correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007679000694598775}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.713110776716986, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006379011368799703}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6969105774629994, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006761958628976886}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6951407672322222, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006624064867867759}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7155814215074481, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006353282702585826}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6985325886365854, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006722803585394176}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6968122674715109, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006584539233814862}], "config": {"model": "hf-causal", 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dd6e2e271788a55a9549b6cd55c090f71041329f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 69.28977694653214, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.1383689347747308}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7343815169425306, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006056443243618853}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7213620418894502, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006317010827359718}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.7178841616364411, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0062143696022672204}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5967189945588884, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007450404459676049}, 
{"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5886810928093721, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007604433786853759}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5864417429777042, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007504689248365126}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.721849768869843, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006265370504495436}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7107134184586044, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0065346021996612715}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.7071359010285193, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006438542604564407}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7242948640713006, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006233954534836144}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7124045941531557, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a 
goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006494187176286558}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.7088299461511488, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006396597162497901}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..78a81fea1dcc0b5b54dac050edbd635bc8586bfa --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 70.23533344984375, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.9051054037488828}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7346785329679391, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0060241039486872055}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7292725498470866, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006197287192729478}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.7233652492867584, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.00613471446094234}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5968961483516068, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007470554111948398}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5947349139408712, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007577156941207958}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5905591389126227, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0075043884335545055}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7231778967509447, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006226778582598875}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7189070797560939, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006405887103113551}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.7130555222674428, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, 
"comment": "", "rougeL_fmeasure_stderr": 0.006346247277373668}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.725163033308248, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.00619102323663373}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7205115281184148, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006363858375725474}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.7146446045868446, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0063049545258677085}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9e838d38eb50965f5a511d8bb7f2869b45fb8093 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_Correct-the-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 70.32747678674686, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.009178623036782}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7365451997720951, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, 
sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.005959364204972434}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7321791188483596, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006130704848408115}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.7257789808109173, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006064762057166745}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5988456423040933, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007405666798204553}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5975633970178799, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007525805689294777}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5928285347617148, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007450597694271025}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.725508588212507, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006169249161730822}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7223180805776619, "fixed_answer_choice_list": null, "dataset_path": "piqa", 
"dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006355782042971553}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.7159451295716375, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006290151298303866}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7273702460258058, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006130370372686494}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.723693210508481, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006312364180322713}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.7173761682145219, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006247247805738648}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f27449b298ecd8ac9f7bfe70740ce24ed6c9fbbb --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the 
most appropriate solution", "acc": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0d12fd7aab9a1b5e3b5e706ad194cec8008a3e00 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.500544069640914, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665817258899171}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.500544069640914, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665817258899171}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..99d4aecea406225632678daf6b186a1dd918d771 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5103373231773667, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663330673075898}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5103373231773667, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663330673075898}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..72b86b89541d37a83bbbcc9992317943573e4115 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5027203482045702, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} 
or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665651503000718}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5027203482045702, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665651503000718}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f5906820fc29a9c8b0c180cbcf063bc08943e803 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665713661738877}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665713661738877}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_5.json 
b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e2b37286adf098080445213619c3a022e329ec5b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.4961915125136017, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665485744746795}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.4961915125136017, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665485744746795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9bbd84d62ab4f2329bf3915e9a895be4699533b8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.1430667201766189, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.009240564731257003}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.020963380772971652, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0005571764739275214}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.22474022783929637, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004232469770939408}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03634213056601697, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008852037419284785}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0034206314074165032, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00016377560466964583}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03979709302516331, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0020218681983000908}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005972148606985499, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00027694080927670884}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.018785915434537857, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00045961716766157023}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.2066360068752239, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0038201093018895315}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.032687390001550964, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007387342525395488}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.017178056045255737, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0004610204906653324}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.19189650357961507, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003798369392011233}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.02982601102364362, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007298100305106965}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..df102550073a9832ae93f8dd3bfa31f6fdcacdbc --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.3124984853443106, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.02657457812443979}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.07320008291691311, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0029027310550941185}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.19206179363741116, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004198403115580751}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.07833603488907773, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0024221956295178153}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.014935910160543195, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.001066118611892239}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.035651065690208456, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.001838136787740149}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.015687196908127872, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0009548214688453516}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.061440768676818164, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0024509134853169003}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.16762193679347684, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0037347291719288735}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.06582546671313708, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.002008068318413744}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.06328842881073514, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.002545161737546467}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.16802715246598093, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.00380229809853059}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.06732250510342716, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002101257193346951}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d94c444cd89bdb86454ab3fb84c2af767c968788 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.6472470961799496, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.04741660786692279}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.07353029295593974, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.003521754204664174}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.0858786615791722, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0035032589854169945}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.06105404439408963, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0026311682541945358}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.016706091366165337, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", 
"prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0015315526778284171}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.0184312055942394, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.00146285613399111}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.013649436315540416, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0011294417436508105}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.06272665005848953, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.003073312683730802}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.07420703288761338, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003078170412658722}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.05190903182793309, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0022643442898557923}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.06458457210352117, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0031636187057443627}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.07525324884676557, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0031251840055039758}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.05329452859101479, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002333128047865669}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7cf06b7fa06fb0644e0b320e84685f5e5bf33539 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.6062635978384739, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.0782482303422379}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.07452134039693918, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.003576611335222079}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.07615883857534307, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003364735282169584}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.06096787511489869, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0026990107298849103}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.01697672999338089, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.001565185650016919}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.016230779058456612, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": 
"{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0013871281299543618}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.013471867168580705, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0011304768221616576}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.06522960654210525, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0032019612779513027}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.06648909991780103, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.002980799399292851}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.05302910218785953, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.002381323449986505}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.06706238126084484, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0032872562006614737}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.06804502241505245, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003049896418062481}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.054388195617974935, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002434407168355548}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bc29a7c461626f2aaccce3960bb7380b5a9216de --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.6185938082482176, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.07788119676580058}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.09281330184555696, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0040489108208186355}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.0821435584513187, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003365995853519246}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.0711834879764609, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0028766907517215213}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.02177748235883171, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.001798003291922353}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.01795915149475838, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0015049699206362978}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.01601647142341596, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.001277963330949647}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.08028507288137521, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.003552480741353349}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.0721305037000189, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003031967737705387}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.06189574961323715, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0025620835394899715}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.08377438466476718, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.003718032093123604}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.07390575338066406, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003087807865518411}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.06398308391285433, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002632124175332435}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..93aaa02cb847957b44c147662c7307f42cb19227 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_no-prompt-needed_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.6938298171270286, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.06310426002073874}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.09754684198832757, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.004080988763741584}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.08901016385362265, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0034796844878200567}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.07652826047415558, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0029524964693573093}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.023058115240956877, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0018391142815039204}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.019975557763278272, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0016124179718119594}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.01743988575661535, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.001341444432007256}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.08435632806664412, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0035957308634670844}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.07836574159905216, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003158055668768904}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.066416662354599, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0026142608788368494}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.0870783293964629, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.003725904197191887}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.07928068070866405, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0031561546312338156}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.06800373888434776, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.00266716520288408}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5fbca079263c2faaf0728f7e923e3f388f8922b2 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4956474428726877, 
"fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166538214464238}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4956474428726877, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166538214464238}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d2c631654a0cd264fd543a8af7134ed66e8347e9 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.499455930359086, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665817258899177}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.499455930359086, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665817258899177}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No 
newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c12b8192d0d0e8251714e96ca569143ed5c1244a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5108813928182807, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663061261117746}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5108813928182807, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663061261117746}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..912e8b7a245b117b33a464e5172f5237444a76ce --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5217627856365615, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011654768618560072}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5217627856365615, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, 
"comment": "", "acc_norm_stderr": 0.011654768618560072}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..af67b5951904e578ac33d07afee5a4b66b0a70df --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5119695321001088, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011662480968070071}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5119695321001088, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011662480968070071}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4bec0a3e25963d3df7f8603cf2b400f506e3381d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_pick_correct_choice_index_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.500544069640914, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665817258899171}, 
{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.500544069640914, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665817258899171}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..240a44d027b0e4f6c14750f82a633f52de6ef904 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5620239390642002, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011575720065594108}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5647442872687704, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01156760858875942}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d23ae9fce0d8d107600786aba308ca21e4c76457 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5565832426550599, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: 
{{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01159088337366686}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5516866158868335, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011603326108334509}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e7856c51cdc91a82d228bf35c49f69a401faf854 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5652883569096845, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011565943814308853}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5631120783460283, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011572517929968272}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8219a2aed4495c68739e27e4a7767882790be72b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5505984766050055, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": 
null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01160593662415608}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5500544069640914, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01160722083798011}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6fdd90b30243068d8ec52675db1dbf4bb1879a31 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5522306855277476, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01160199979686681}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5538628944504896, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011597936590301236}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a1426476fa879064ea3f1f735a4a5ba8c1a44780 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5576713819368879, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011587963545507183}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5565832426550599, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011590883373666858}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5d953171d74c1522f433742de3e3f4878d580486 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.577, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01563058909047635}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.503, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015819015179246724}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8e327ee08a948f5e8a920c222890766c3da3f1be --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.639, "fixed_answer_choice_list": null, "dataset_path": "sciq", 
"dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015195720118175124}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.633, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015249378464171747}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ae08b794c2ab54a46043dcb9fab73d59556209e7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.643, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015158521721486774}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.647, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015120172605483697}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7fbbf1dde91a4330e15365c0168373f9de820a36 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.645, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, 
"comment": "", "acc_stderr": 0.015139491543780532}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.66, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014987482264363937}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8103d63959dd25b18b2c6c6759d892912dec18e8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.643, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015158521721486773}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.655, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015039986742055237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..79b16bef769df52df33c5eb4bd4367fa49d7ebc3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.644, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015149042659306626}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.659, "fixed_answer_choice_list": null, "dataset_path": "sciq", 
"dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402707}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5830bf373454c8da25996cb15ff8d3d22d7aaaef --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.832, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01182860583145426}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.757, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013569640199177458}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7bc62fca0d17bb87a6fd776366a2c1de78333234 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.884, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.010131468138756995}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.875, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the 
following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.010463483381956722}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2ee1dcaef5de5af108053764b4fc93312927ac38 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.896, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009658016218524286}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.897, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009616833339695803}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5dd441a68458c09df66f280711f6cde6169a6599 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.893, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009779910359847165}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.901, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: 
\n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009449248027662758}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4ce93adc3757a2fc5e1341e68855e4df91ae2c1f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.904, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009320454434783227}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.904, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009320454434783243}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0a90c52d44d490aa515d8f21ae18def453430dc2 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Direct-Question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.906, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009233052000787745}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.913, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: 
{{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008916866630745873}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ce9dabd9e3892f0ae44200654632d42624bb1b16 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.317, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01472167543888022}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.337, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014955087918653603}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..7d1c2173032408ded5a267bef407ce5aec72af5c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.346, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015050266127564443}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.348, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015070604603768408}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1f101058569b866e98a1ddbd37ae5d5171b9fe3c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.361, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | 
choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01519572011817512}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.365, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.0152317762262649}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5d2b346df8aff55d39ef8e3598c771c5c3edbb05 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.367, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015249378464171749}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.376, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 
1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015325105508898125}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..63431518db30befb9c387bf2de128c23aaddbfaf --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.365, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015231776226264893}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.379, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015349091002225347}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6c5ae2ae2377394bbff1e50048009f6c3f52eebb --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.357, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015158521721486762}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.373, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015300493622922805}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..921b85b9f7330b5d754a8da3cb52b552825882d1 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.309, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], 
[0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014619600977206486}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.321, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934642}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..550c0ebaae25b984ef0df0c26e1ae9e83d580094 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.35, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, 
"comment": "", "acc_stderr": 0.015090650341444231}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.343, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015019206922356951}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6c34040a9426b1186f60ad9dd3fc1eb41a86df9e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.38, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015356947477797585}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.397, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 
0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015480007449307987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..feca246bb962c607888b398b223413866e863b72 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.417, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015599819048769618}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.407, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015543249100255545}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0ec9ebedd688146f75f12376732976595f5a9b3a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.4, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015499685165842589}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.408, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015549205052920675}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_5.json new file 
mode 100644 index 0000000000000000000000000000000000000000..2d6e66c47b409240b8c52678317b0112a7e5706a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.368, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015258073561521805}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.388, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015417317979911072}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cb6d78044ef0a701e1c71d6a0a6937b1af92bf13 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.32, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 
3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014758652303574886}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.333, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b4670e1e81931198bda85b12fb52af188352aa93 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.324, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014806864733738863}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.342, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": 
"63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015008706182121731}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e72856b273c68538d70f09c1a17225436d0dbfab --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.321, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01477082181793464}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.328, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, 
"comment": "", "acc_norm_stderr": 0.014853842487270334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4d1feb4dc87aca0cd2b753ccef8e852f19812fe4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.323, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014794927843348639}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.326, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014830507204541033}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..6045a6ba6881876c046f6fe8e30b4e6feebacedf --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.331, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203934}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.342, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015008706182121728}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d940d100c1e2f4075736d17429a9fdf33852f3c9 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_sciq_Multiple-Choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.307, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 
1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014593284892852634}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.326, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014830507204541024}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..83b9be69b48c5dd325c9687d17e0e6480b02e063 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.5093532870122929, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011560409019420364}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.5243185462319615, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011548748301487312}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d3596a0f81d07ae86a760fce8dcae6c016516056 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4692677712453234, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011540570846495547}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.48476750400855156, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01155706536834828}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5e82e97f1ef84edc465f0c78f0ae0c3d41df5ef0 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4569748797434527, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011519544865928065}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.47835382148583644, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011551591851683337}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ca69a836a4a93f9d58796f1bcfc8d0347537e1bc --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4596472474612507, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011524715486240652}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.46766435061464456, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011538227692217271}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..62bf071e22803d61575d40a4a459293accc84215 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.46285408872260825, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011530479981182624}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4665954035275254, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011536599118298163}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..539718fe54fe4024f432c73df87956f515d6bbf6 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.45911277391769106, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011523708060182089}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4633885622661678, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01153139408454962}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3cecf8ca9058e1415e86a653a19888b92002bf28 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.518439337252806, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011554566910658103}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5360769641902726, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01153229486915312}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a16c66da770e64f3a2db567ff830e3eeab8b6b00 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.484233030464992, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", 
"prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011556682042196382}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.504008551576697, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562060664045727}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..53bcfae7aee0d85828ecc132bae5ab8f9a35f03d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4740780331373597, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011546883081384896}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4927846071619455, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561228264646724}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4f7c5692f6506e5a8a031da5b46b1e57d9257b24 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.47728487439871725, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011550494192008943}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4735435595938001, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01154623481377739}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..01736896b2fb0d65a9046c84f3e0a19a2250f59b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4735435595938001, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- 
{{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011546234813777397}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.47888829502939606, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011552120807053822}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..59cd3dab3617de17f31cf378437535db220eaecb --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4665954035275254, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011536599118298168}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4681988241582042, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011539022035111222}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e54f32eb508b19b43c67fe7fee57590e326df413 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bfa75099c7184d4df2bd7f62623db6afa45ca4b5 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..17731952da60460179483c210c762d72f9c8de43 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..05c3de9b601968d2abe64c883faaeb5929203ccf --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No 
newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cc0786b54223c344f03efc034e6eaf4f95486c34 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2a5a061efdc8e0df8df5ba5bd8e6f8edb4203e39 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f40e5b4b58e37eba2b9f9925ed4126f8fb2a55dc --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.498663816141101, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011562390964658758}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.5259219668626403, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011546883081384905}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d2f953b15daa174e79a073f70097b40754437ee7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.46766435061464456, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011538227692217271}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4949225013361839, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561836054238776}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..92794c8128fb260fb5488665fcddebba159728c9 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.467129877071085, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011537420054210294}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4746125066809193, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011547518083754583}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a0ddd5758f703daa44a7db6bedc1d54f677431e7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.45537145911277394, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011516282203726655}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4649919828968466, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011534056494505864}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a92a788ccf8aeb116e1d0a3ca29253d049f0bc77 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.45537145911277394, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011516282203726655}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4564404061998931, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011518470676766505}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d2f30cf224cf96dda300ef19d6f166fb227d2c2f --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.46285408872260825, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011530479981182626}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4607161945483699, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011526690316014585}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d1f7aee0ca59b315079aee3f0fe72eaa8e6faec7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.5061464457509354, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011561558589040751}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5339390700160342, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011535764881641411}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2b9f4ae2833937fc76c01821837a5c96456016d2 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4879743452699091, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011559087533800682}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5045430251202565, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561954965856519}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..36cbec44aa8e7348784511dec60ebfa86b3c3c52 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.48102618920363444, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011554104174019692}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4831640833778728, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011555875693960778}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..063bf2ff99795468a557bc21fa6f5736d7dbb828 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.47621592731159806, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011549343521088362}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4681988241582042, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01153902203511122}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ac944105d06d4414216e26cc2765f81055a59 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.47247461250668094, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011544898473864588}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.47621592731159806, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011549343521088362}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6bd56e20406c28006bad6253848930683f75c258 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.47140566541956175, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011543509045585203}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4708711918760021, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011542794417345716}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2a8eafe7fe90abb9266cbb3d83b70d56f933cbe8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5018050541516246, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e1bbba9dbec61603c8018dc8b874a1d869b878fc --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5342960288808665, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030025579819366422}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.51985559566787, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030072723167317177}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a2db9a46df3610bdad6f9e4efeb9413abcba10ab --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.516245487364621, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9b56e4ca8416b8b3230b106c3b2d83d82a220b9b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030063300411902652}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030039730592197812}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..533cc8426feec54e7869003429e5b4111f5214b1 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030039730592197812}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1dbbc7c7ecd338ec728f02d3af1b0d9680b0c671 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03003973059219781}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..908c11b48e39a5051876a3a3d4193eade62aad38 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b262448f705316189c0bcd11e7c382321f1f248d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2fb868f7e3bcbd60969452a433afb663633167c5 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.48375451263537905, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..13156fcaee95e38f1b4fe88c1d4cfd4c83c0b7d9 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030063300411902652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8655fdc0c483ae1abc510b180310ba0ed712b6fe --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030072723167317177}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cfc1bcf9cb5a21f9a2bb05cfecb74e3f66fe9c98 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03006330041190266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e7fe1b0b9232c5194bb9c7bf3eef6b571d72b05a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5910fb0e6b35a653e895c0457a8a446042e31f4d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fb8c4d0783932d253560723cbd08c54d933a6bc3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.48014440433212996, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0300727231673172}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..916099e097c3a303b4867605b7c320bf6d1356b4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6199a272a7805f6d956227928435dff40107c43d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03006330041190266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3dfb139cb1f90e01ef5cf9f83cd9377778f3b722 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4d438a0238516cfffe24d9e685f9d677e1d678c4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030039730592197812}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e6526918901f89e3f6c63810734105ff7bc4beb0 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9e8d16c8e2ef9b80f75b3102914649f5676849f4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976626}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4d6ed21593e9a1113d1bd6aaebaa9fab445c5430 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366422}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c998d1530afefd6e014a8107c3f02b5789e84c29 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5451263537906137, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029973636495415252}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03003973059219781}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..888aee41529c07ff0c6e9aa3c681066b767d2751 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_guaranteed-true_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030072723167317184}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3c0bac81a2ade8a147584eabc58bfc913be20e87 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4007ffc237fd698d44c1d02a8557e2b0938c0835 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..30a8285d2b39900a1129e7998f4546d1f9d98496 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331327}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f75c41e4deeff94177c3fe15a79516f325493172 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..80d09a995066f8463447cd84253cafd18f95dea3 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030063300411902652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0b8b6e8aeeab9eeaa17d314a28e555e5f6f09f0c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_superglue_rte_should-assume_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..06fecaed2cb739cd2a9dd904aa4052bdcb2b63fb --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5193370165745856, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014041972733712977}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616441}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e5236edb16f342a21f3c8a5efd9462ea5210f608 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.516179952644041, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045126130978594}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5153906866614049, 
"fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014045826789783658}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0a74cb2ab42a8c8dcf9a8e034b87f101704efea9 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5059194948697711, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5130228887134964, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047718393997663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f7439941ced8fc032eb355ef78037606da7ed3ce --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.516179952644041, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with 
the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045126130978596}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.505130228887135, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405174596179051}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6f0ab1511f7ac4e61f19b98685e4c09d020658a6 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052446290529019}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.49013417521704816, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049749833367589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5c42db7a691bb684d124f387ab3995052861ae10 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_Replace_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"winogrande", "prompt_name": "Replace", "acc": 0.5177584846093133, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014043619596174962}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049512}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..293ae8cb667758326d21e58e134bb62572aa976a --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051956064076896}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4940805051302289, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014051500838485807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..755873d79351ca021f0f2c75ba0cb48c624d7ece --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4980268350434096, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052376259225632}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.48539857932123126, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014046492383275834}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..43f4fed691ec79791b97db7bab03e8273bdc047b --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5019731649565904, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052376259225629}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4846093133385951, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014045826789783656}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5698bdd62a0ca1e230d8326afa815a44e7983dee --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5146014206787688, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014046492383275832}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.516179952644041, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014045126130978601}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3d6e477b1781d50ff151bac3f1ae8a6edcdcfa1d --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.510655090765588, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014049294536290396}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5122336227308603, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014048278820405621}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d830c9bf57834da2754221e831e1a7ef2fcc880e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_True-or-False_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5035516969218626, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052131146915864}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.494869771112865, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014051745961790516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..32354c7c8e06cca1964ebee75a8e24bb88d1d24e --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.489344909234412, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0140492945362904}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4846093133385951, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014045826789783666}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2473a08a22de2625fa916249cca248030bd1b2 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5019731649565904, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052376259225632}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..37260c7e1fcd8f42209a08b8a87a883e93c3263c --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.489344909234412, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0140492945362904}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.47987371744277824, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014041096664344327}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3b10a5ca61a35bc6524c1798eac523450523ff20 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4980268350434096, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052376259225636}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.48303078137332284, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014044390401612972}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..db36b1bbd0e04c066c39254def268208dd4a3fe4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.48224151539068666, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01404361959617496}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.48697711128650356, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047718393997667}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c8557fc42319917670233b7dfc9f6b8bb0acfa26 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616452}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4940805051302289, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051500838485807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6218641f63b7c4e288779f0fc02405992c2eccb7 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5138121546961326, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014047122916440415}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616438}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e7a46858941ead53e7f87745ed1bd67bf0927a42 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": 
"winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052481306049512}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5130228887134964, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047718393997667}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7b86ac6f2ae725bbd0afb4876e166d5269398cab --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5185477505919495, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014042813708888378}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5138121546961326, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047122916440422}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..79884b582ffb1c7f74c92ba8d2be77db4bbfd173 --- /dev/null +++ 
b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405090552122858}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049294536290396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5b0e5bee74b8ebf312ed44bbf7ef12d1a944dea8 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5217048145224941, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01403923921648463}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5114443567482242, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014048804199859325}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_5.json 
b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..99b8873c09ad1f9ddf5982085478ddde3e47c854 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_stand-for_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051220692330349}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5035516969218626, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052131146915852}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_0.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6d10a46157deb4bbd6721b049ae90b96aab3d711 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.4861878453038674, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014047122916440415}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.49329123914759276, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_1.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d9e660396a096c3b8d63b89a26641c9f50675611 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050905521228577}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.4964483030781373, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052131146915853}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_2.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cfb8b70a18bb8c7a447a551715ed69f167580286 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.4964483030781373, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052131146915867}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.500394632991318, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_3.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..49b460c69d5b6aa627d102e79244dd3a43575013 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5272296764009471, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014031631629827703}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5153906866614049, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014045826789783672}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_4.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f968385b5d2561637776276bbb072af5aba15183 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.516179952644041, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045126130978601}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051956064076903}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_5.json b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..246b0b37486b72428c1dcfe4db19704752772be4 --- /dev/null +++ b/4b284b12boscar/eval/agg.4b284b12boscar_winogrande_underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5153906866614049, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045826789783666}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5138121546961326, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047122916440419}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2ae447fd8531d18e58fd9a8d78cb7b67bf8345e7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbacecc766801b2b050973c4a9134f731c6dc50d8ca582c1fd8ad86825770fb9 +size 3911210 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45164486760175f50ab10954ab9f44558cc6326a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00df347126c96d365c98ea889108ef1b681d56798e7de2ab0732c88f63c73daf +size 4712492 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..afeea1a8db5f7511b1d721468336afb045f18dca --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd66a717a4e76ac3a214a9f5eae77749694c97cb5ef8169989edad6d2c2f14ce +size 16732227 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bdbe7f5d113c03aeee0372c6594ac822e715e419 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b0f4db09871e82b40875fe8451a4611a4d4bcfeec1d25f180f45ad9ec94bc19 +size 19420959 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..b4f5353b68416ef39f0bd38ac4659292b00b29d8 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:043fcad768c0fc64632b8d7de71f5a2aaa32bb9ae53223d9082af581b57dcdf8 +size 14708988 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0c64700af42078ba4b48ede5e73cd3e520300ccc --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d6cc25d93e866d0c3730c6570732b41b03cdd72fa64e2e6e123a8bd889363a4 +size 8229785 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f225ca63a70093721979289019bf692fc8ba1f49 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e4402fe9518ab805f4256dafac7ab38f341b9df08b2047ac3b81a52277b151a +size 4483514 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a32b7da7bff2f4e45e5a77e5842aa11343920665 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32a2642467d86285c2791b99206c6613a2fe2419da0393e4ff5744eeceaec3d7 +size 3383780 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..52756012cff8e76088dc08723c2bec02e8e20b0b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b882dcc6fab8961240973a09e86c6adef6d7aae24d1ae3e8c6d7602c55cdc7 +size 8114076 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6847f74c880590be0531a5cec486f6aeb30a10d6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:998f12ba2c2783390cdeead501ec764df64004055ac90da4ee0869ac6624917d +size 9557248 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4df9f9dfbbdec3ee4e2a856399f3966fe4534d47 --- /dev/null +++ 
b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:702375ab61e1a9fdd8bee7aa852f564da79ac041d6afcdfdde495660ff5ca37c +size 11010674 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc1ac5fb011f23eea786a21e99ce0a6bc6b5dfac --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e12daedffb8497256b35f0e1360ff7e5c38daafd43eeccfa837936f84e6eb27c +size 6234220 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2cc8d971c5e661c14c02c510b312909517cb19f6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68297bc7254cad8a37775288d5dcb3fbecc6b3c30b68cc69b9d9cb27ab9b59e9 +size 2620478 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7a55cb25370c8f846e2d7c964ac084f2cde07760 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d46f65396d9664ef59ab65421d6d27380f55bed70f70aa40d2e7435bfbd5e0fc +size 3263961 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9e0fbc8be854e86f84ac395a8e3df9a1c5dbc2a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c277f1d102fe39c28eae92b2e8a411c3badcce9f3cf46cec1651d05d46627999 +size 11559102 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f2bcba11df6cc9addc7ca8f0a7c8a74cbf3b44a1 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:313b9994aaa80219157c7cfefa419bdf85fd0d1df8d0d7c3a4ab4a3ae616a4c9 +size 8942412 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..926c7df404c22e44a5442bf0e2446fce9e1cc165 --- /dev/null +++ 
b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a67ef94f9cfbad9712ba1187f5c7c82aa2b35264894b7fb7b06fa4657999fbe +size 10211878 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e639e7552f026dbe3ae39e271e856e0c2539a72 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f38ca37d1bcd950fb211faed9961b3fd492f17170872f8c0ebf0e56692a263dd +size 5760119 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c2ac62d56d644b02877873c1e711af8be1c123c2 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70fa9b3461860c3d4818e3fe0b91dac2a50ddd43bb17fc59d86c68cba9652c38 +size 3852945 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e105b21149f29873842c73d12b8276cccd08eb85 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89435f188f50437a4a1065f688d2ccaf5d9e4e818d8a4e1f5e8b4683d109417b +size 3846262 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6719af5544c0f86fe6f5ae5f4d343934f653979f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02bb06a401190d8c6882b93e9784671d605e2ec4a4c619224d93884eae3b1c60 +size 9227210 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..31e265fef737afabc17a6ecd890eb38478a96073 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6ec219401942bf47fcccac21980618fa0bed431ac6689a2d5f4139c1b30babf +size 10914234 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..091614e9fb08d6b4c2238dfeefbe3a173dd757aa --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:8eb2f51eb9680e73621925084b324add7088bd3261bb870a04dbf56aaae247f1 +size 12632550 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78fb60ec10eb02c1fc3db0c514742fe42f49eca4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:627a1875691263a1f7259f4eba7d921456510256903d6929c37c9bbc93cd7abd +size 7181367 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78918939fb4e54a400dbf6a8059cf192644ac388 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e558de615cb2bbc432cd5cca1c85acce4670b9a93118ef1d9e19bb602c57e00c +size 5435469 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..156332f0276db4eabbc250c8aa4cbd808991b8e7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d9af240ea49f1a5a77b306b37ab03878f1c7a2b2e1cbdf207dbd91308a42a4c +size 5207219 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4be0fad7e6a3915a50c95dd39b147dd5fa31ac5a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e3f1dd63bec365b93a4fa07ad72d925662731e737303a82965ad1663a06c89d +size 19351791 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..75bdc4c150ee09dcfeb71697ce8dec962c4b5547 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6157bf4ffa654e2a470231c0eeef1ae5ee15ca1ff4693709983c8cce17226b91 +size 15522984 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..be8cffda8a04e0372dd2a738923dde6474e64881 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:186c67ada95af4a18f0a2579ddb38f1dc42aacfe2117fda9c594cf0e764b376a +size 18141514 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..347b518372818e7582321491f9f1f4f2e7b33054 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:272484523feef5e9dc6f2070ba5fa859ab1c5c82f32e3646f958ea4608d68faf +size 10410849 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d798539cf5924a0668614db68bb407cc8f185d4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d704f376b2d66270cea50e2e464e0fcdc9a2370af68f214bdd8b8ae6360bf46f +size 7794879 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..48bb2699b5363bb1d9e7d75e24ff2b7366f6743c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe703b969468a80646a73f4fd3c4599fc5b1a469790242b2ffeb97e137659808 +size 13511123 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a23b2c9d6211897e3375d07d78e3146ed93ac42c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5781615d6857808d3969fbd53ebe5e26ae25bf05c467a67f5b041f0bbc4a6793 +size 57381630 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..88526cc39fe6bb611efa23fc97d32ab152ffacbe --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0ad53b64a2b82a2b2ab75ad97e7ba8053e8b6c941d321f23edfe6017875c68b +size 73762584 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dbd7838313ed9113bd3a854cd8085a51b3c611ff --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f02fc9187ca7f0b4f56ecefa151d3353752dfd1b49f91995f8a251d4b2bde724 +size 59693456 diff --git 
a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f149ddb0d99db832836cb2b6de1c5f81f49001b7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a8f8cd69b629bbe685bbf5b75794fca57f1fdb6a585427d14237ee69abdf55a +size 35284853 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63b7b45f59b2dff900cd326b9fc3957233ea19b6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf5c8c65dedbd0add868c06b450b8fafd2230d9fbb40a5520a384da92689fba4 +size 7737736 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a179e3df670cdaabf08972004fc3c82a7916b07b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d560eb7fdb01bb3649c41c03d7495263e18bc7cc8f64695e8a3a5e4244341584 +size 13318650 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37867f81a753f2516c64b8f15bd843815c4cb507 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e316baeac5c7fb0c3c2db31b733b31e5c75b341431f7e2968e740b7e35ca938 +size 56907219 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2879dafb3838b82e4eec0d3f3fbe5e52a3fcdfbc --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9f1117dc00fef7d8ecd13a64b15cfd0da41dc362f65e681595d0edce11c71b4 +size 73493280 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..15e135b52d7d20f613c89dced0471946b8cf8a21 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd4a2b8ed63c6d432665a0b92b4578045da3b98a2b83ef032cd1031b03ee06e +size 59821758 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..800a121e55b4535eaf3a803dc1de59ec080bc7c6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:328c0c964b2c386223a03c1c635453f408877306ce6b505ab7396c099e7b7747 +size 35421935 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9cbb871270665aac1dc8f5b290a5dd76eb29f16e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:375be8a3fedcc3de3c2966180cffa322ffb78934e00ffdbdd79928b3e05a6edd +size 7794453 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7c337354978b504b7436531ee42d6436d5d77a5a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb8a5ed56e75874412569af5cb784e8788d1289b907749995e867b1aa787a920 +size 13397920 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c25a785f8e02a0b1279a5767bb5cc69303438212 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:404346b19f16199e24c06b440e19e8807d13396f2cf28c92ac07668e9fb2ef4b +size 57238932 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d2d402fe63cfa3fa9aab965a26537fb46e458d90 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bfbb5069a45038aa59fabbd4707ec64eb68d9a18874dfe5c38e0d5e844ede64 +size 73842762 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5319cb265d94abea97aff76472dfdf9bec413c38 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8111879681395f7cd70d40a96d2797e2da20f79dd1da7e44b0ff8c35a97fd714 +size 60070546 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fbf78c9cb32e22b87667c80c50ab1d5ecf8ae91f --- /dev/null +++ 
b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a326e189f182e2a3e21c5ed0d1e552d0f632f2e5ee9a31159e2991dad6ad8a30 +size 35569228 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b834e86bcee0b6151a64dfa11cc7a33fe131168a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71736a362d2b6e298801b68b3a0727c5d54dd0adf5f8dd7c1adac7217f0e6de9 +size 7367962 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5d6e97517a22537797010256516f3af49193e798 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaf0678f962af0e770dc6ac9d45291f7670d64e1e6dde48269d2af658b3db26b +size 13028196 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0be5462ea4acb9c05cbcce18b40a49e9febd05cd --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a04f7e6ecffe431b63e029aa60106ff3a586ee3b86ccf9ca1203486f66cdd17e +size 55741794 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6c75fbfe6a55bf18cd2e0c1eb9809ecc75db6c6f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc7d0cdbae0f0bde6165b47e0cfa83c80c0f6bc685eac3f3b133b5af57a237d7 +size 72114006 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..006f86160d03e8aa6cd3226ff2cef0cd12beb22b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b772d6d21c8160edf2dcab23b712b96a2964c05137379c2187e9689967a2757 +size 58741868 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d38d87ca139f1b082bd98fa33c366c4b4e013b2 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad3e7bc4df3222c676554c7a50478d403eadd90565d124cb24f8c6b5059bfe15 +size 34782928 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl 
b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..917fceeb290e911f8a50b04f3cda8617de23bfb4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8726fe09e1bf6fa6d28dd1c079c1df2d9ff9b709f4c2ec18f63fd8e535b27da4 +size 8066991 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..93e9d1af7d50f0178990267bd20d7172b7d36807 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d7a1bf0cf1b36047f728185e37c1c3ef932c36528cba238d1dbfd5a1a3d4add +size 13786692 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..835ab8607e5eba1e722d19d5edda19caddc61724 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:285c02bb27335ce3a5e8d9ec0249b997a1fdde16d51ffce246a91d879bdef83f +size 59220795 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b23fe97f788642a6c95921ba1af6c960fc0b9e3 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a61054eeb828d959bdd13890b2d0c18ce5004f1c10f114031e07caa6ffff9aa +size 76088949 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..922f9418b00512e32fca9ff7308415298a1b5a8d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba1ba261702e826257a4c8a0a57f1a2d4f574b32a4b4a0ca8383e49a8c1b7ecb +size 61653700 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5a274318ac8ca1fc7429aae65c8128dbae034e6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab1fcbfc5da0219da67be84a1b9ff47bcfd4d662a69ee6795c0fbc212406849e +size 36467638 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68389a3f6bb9a5e0d7e339539c1355a3895ee39a --- 
/dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f7e332bc62cc5208330c44778da8f99b72b9abb28d4c479feda3f832b7bedb +size 993116 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3cbe9f1c1834fa14bec918d612f2d8861a20dfba --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c16fc0d77bca59cc8d57e666b2533ffb721cccb69cb1bbf2c9d4a387f669ba7b +size 1451214 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..636cb09bb600529774b38e0af8e3ff048964becb --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2979a2522e4e8c00857d73450918759712ad0b19f35cab057c0e2ccea84e6afa +size 1909992 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..843e118a5cca57d46058edce310cf5fee40261d5 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:800d89b179a32d049dc843a43c13857dff377d9bb535eec9514ef87903751d25 +size 2367298 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2dda5b1565908792fce68796e75bfb3831ceebfb --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19a90aa3b53e26b226d85f679d104092c10316c18ff0660497884aa78805761f +size 2821664 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..39b05aa84dd367234ca294c2ca1373c646675ecc --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f89e4e012a5cc4dae7e9013a00b52822b659951a4f0cc2b0243b3e72717f4f8 +size 3277958 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b47cdf61754b22a90508d43e0f624b1c27134fe4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d70a4334b00831fd4c66ab43b9e5d0779c9ba425b1c44d110124e343f5c834b +size 1203117 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..b1ce878e0baca92bdffc5e1e95712a14e4725e7e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:701bf2d17666b683084b60e3615d0beb15979b4cea0af420cfaf229c978efdd9 +size 1755006 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..395b0a1713e7b5fecda080a1b403be53f475b5a3 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ecbc4b5ce950f792827de5f8000d697a0c84f957593dd8e1be72ed0f4857c70 +size 2303922 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07f9481d10b49db31d9af9a755027fa721ee91a1 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:477664f3751da427ce7b515b42a8bafe03a7642c81d85642a081197f7dc93ed4 +size 2852135 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e84a1adeb3c7060a013ca46edbfd150d92dc7d74 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3327379403eab9653d03eb4b903dd587dd39fda15e87dcf845a172ddefd5aa3b +size 3397672 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c4c4457d56d07290542fc4edc278da3fc6780905 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e975e55db63b0d27c519eda28e952c2fa6162cefb5af3d1e78b84782b1658053 +size 7890314 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8b9ef90db3b093426c1072e3961087e747d1d1c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7b1fbba10a7d94e2066d1c61c5005fd59dee1747e19fe4ba0ca46170de044e2 +size 1008023 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45b2782ac4652dd6852bf2dc18fbd3a31ae6b747 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad51886490d5679fe5e10ee061bd78e0fc61af278be4818698798c3a75e0c91 +size 1478640 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_2.jsonl 
b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4e98f42798746e91047e11f8175fc75b6d42040b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ba05459309443195b226abd9c14198bfe646e7092a9a3f163ff566fab042e56 +size 1949533 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..72216095776b9152438097dc85ae6aad1eee632b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9beec26ceae59e09985c774d55fda96668adaea7c59283b773feb95dd47e819 +size 2419043 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e74f2a38401559bca3a29d12c98575d450f6144 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1bf86bc5728a1aa0f38f51d7e133e5f1aab3013714a2a4d47b9c537a6efb8cd +size 2885395 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dbafd73cfbf3b8bcce04d4f2870461b722ff53e7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffcb6a6f98761982181e13175eaa8d80bbc00818d6a89e5fb020d54fbc2ab576 +size 3353706 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d2ed6c2d70b8a9802a291c7f6a23b3eaa7ed31d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:734083a79996c631149d4dd8245a00cd486325bd2a084311d3f99c2fda44f2a1 +size 1160116 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f768ee8fddec93983f46091e5f1a0a0b71c154f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e7e27b11293b009ebbc1a4d09027b315025b76a1f8d7025f29860086116a685 +size 1668501 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba9dc1537bf4388fb45204a18e6686483097353c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:083b71edecaa88a72366dfd99b5e54f668e35e8bfbfda5062c10cc041b9e7107 +size 2176755 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fa1e6188a8174da1cf20acd946e492aac657a083 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c6d0b5fd6a898ac567efef87e9d1e345d23a4ddf48a83f8de9d9739cdeaf7af +size 2683443 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bebdaffaf0ddf5fa4b64a58c8cb2ef71a6ade747 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebfae25f1c1ef6f1f16cb5dfd8c2a5dec668c9395a69f775872783f77a1190a6 +size 3187394 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a671d0b3a4fa409442709df5bef41fe65e4e629a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:873857b85c1a7c2b3a11260d787349e27a3919af4c908ebfbc5c1771acce7fda +size 7386774 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0aaf2a922bc1b2250dd0b330d36523da86ec7814 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f4ab2e30bbf20d168987d2946705a00988192e44df23f0c6e54d2469ab4f3e5 +size 1027035 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1b2f7514c28b3d0e60f7619687b86e35612ea5e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd21b3162cacfa57437728e6abdd185de9efb3bad975aa59294b76c811a384f +size 1503640 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c23e5dcd2c3ef5dc5d1cf42f67b511b4dcaabfd7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adcc9465e10e3cdaf8be5d1a4cb95209291b1c783206639ff49c44759fa3b518 +size 1980274 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_3.jsonl 
b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7977180c928cf7979edcfaa781b6c8d611372773 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6405accdb4be39cf31369c790fa5aacdbb0c63eb359df1dcd8ed8ca3a3c48feb +size 2455566 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2c05fa836eeedf035afb1d6a972264d2c80cecb4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdaf36ad611c277e67adeef09c1ec60067ce5b13532d1f3cec610948f9971625 +size 2928084 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e57f54877ad0f03c70eff2ed4e9f6fcc5a638b26 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r1_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebbffc395b488fd42bb6f189c552caf9e73e655f0e4560a919aa5ad2a4ac3e25 +size 3402324 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cdad97953d7101e2bc3dbfb1d9b3fce92dfa00a0 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c367721cd8294b817e9e80eb65b2f2b5b77ffea132f1aadefc954a2dee49a5d +size 993515 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f034f01d62bd87563a3d186457849175ff77cbb2 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:101f6ab034dd2f2f75fd05386008fda441ea0bb07195ac9918bcc3be789830d1 +size 1446420 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b6206926ccf944adaf9e1342a5961613b8402b3c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0ae97b6cfb4cdfe330227f875cd9cf670070b57b19123878574f72f03c8ace2 +size 1899922 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..289490f88b3ad586db5a4edc146471ab16ac90f1 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d083e6df6738c46f58913260e9ac3acfea7267a1f54deee2f8fb5cf40aa5794 +size 
2349784 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..33218353a356e9a0ed85433120046a762b573faa --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ffe152bcf9e49b71e47f2d1fd027352bc706094455d653a303395f0ea847e26 +size 2799499 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..859f84769802d77e3862954700e1d973f40c15fa --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7022e373cd5215ebdda9aa927d8ea83d34ddf6d19949cc2f878c6aaceb828b48 +size 3250115 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f03c65dee4395163bb3778c2312bda19f5aa2a19 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb7fe6730e202145022b14d5919b4acc5fea5d6ebac52460e2e1317ff489acb +size 1203519 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79b9effc40be6f7a0ec61081d035d801ea9e90ac --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6600d531936e7ef70031daa10256ab0157ee2e4db6c4008c3be3cb0877b09e23 +size 1750064 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7337ea058ce112ae3d134ba0587afc39666e81ba --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b1736d243b7a246cd0f1a78a43bfa73f22156961218603299f8d5d6f7d04ea4 +size 2293928 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..afddda4f92a92f280e4426119daa7e8976437408 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f2d1017eb19cecb18748992046fe8b3326731f94edea2145f884a1e70e39278 +size 2834596 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8941be234d40c1dace8d224e9c91b3c01df4fa18 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:e26a28abb60e447099e3a11c70534c1315bfa328eacae85092ac84a8614d5ba4 +size 3375517 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..939f214dccab0644e5887867ae7f9669d66e148e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fb62a99263b0886c49429dcc2594187a0c28a00cf2e3a7be713db06a7aff207 +size 7834644 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..31628aff0e1ef2e56464e730cd2f47bfff42ad0f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1e9c6b1f82930386b1476531d8f772ec291336c3db46398b5b756db9238a90f +size 1008426 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68c84d98d6c01085b3a64c630542741e9ce37bf3 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:496f039a787df2357c566ae4c5d91957bc23ef327e30aa50c836a579b31a34f5 +size 1474064 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b3fb4a1934bf415fa04b3088937b06ea0cd1854 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e93ce5c3c22743be34721525fd52aa618fb8e6dd7c326d3c5ada1e76b0db2624 +size 1939827 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8e137a6ceb835c1281e4eb76bea74c23db7db84 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8f81bf001606dcac374858150bf5b40d6ea84d8a1026d7436572830a3a40fba +size 2401792 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14249d1fa1a0980224b1b45851be9fcc5e9f1dc8 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77b28739df68183f0d4541ff136c13b54b89771d336be358fdcfa0b6d1fe9fa3 +size 2863548 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8035aeb4fc40cc753c0583dc8fc44b2858e3dd53 --- /dev/null +++ 
b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4bfa7b1ba4c4e9c9ba77d2e4e3a45d92467334aca50a4eb19a94c84a6cd7622 +size 3326229 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2c93ca38a07925aa8b9be6806ee2f22c5ad159d8 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a85d3797a1268027e74223cb1b063c711a46c5f93a6c5e3c03f8d02e3abc2fd7 +size 1160518 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..32e0ce5602eb33e32ef87446daf417a0ce78cff2 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb7aee43fce23e10a8ebce9c18bdcbdfe665e0b16d86db584c9d8addca14482 +size 1663567 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68b956c2ef1f120e48a024074e019ad05a91dfc6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07f261d15a56574da808aa38d43a5fda111af1d37d09a88fe779777bb615f9a2 +size 2166304 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29000f6f6c3e850eff565dff1177ef464991554f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd601709f967c65c5809423fa7fab76786d60e841a55f27eaf01e384d13ebc37 +size 2665395 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..038b5ae6ce103bc71a0e40e608348fc706516671 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:454c39a866403ea5f5a611ad32af7b94ebbc39ce7c3f3361479d5875e360bde6 +size 3164435 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eb0a96356ef92c88e1750a2e37e4eedc1294e05e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:de0b3a3244b5d75e4a0ff3b7608bbc219ac42e1f5b7bdfab27eeb70580fe1226 +size 7329324 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba0dc2fdbd88e35b913e35fa5fc57971da1b9f10 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:820a6203b957994e6f58abccd2c95117f5627dfde8fec446c63f4426c8b0113c +size 1027452 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8f35ee6550067ae44859052281da76b0f9fc053 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a0204268a3b2aaf8b93a5a32c9213187264e8125ce69590237f2d0bfb6f30b +size 1499064 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0aef102913d8ac4e38e486150e3a1bf6e36ffcac --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d44719c74d3328f8313bd3cef432fb56ab3baf64b32fbd1196967ea693b674c +size 1970515 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79ac6d8d2006956a30125e5ceddc56de61da9e00 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:048812042db681f1328e89ddc36a07fb9c0dae321aaee193b113e8d12d6f541f +size 2438387 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8895651c32447bd40281b02404ad0333a1b91c43 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ba92af63a10f2cb2b3bdc580179e975d0be9b36e836ee324ab7ce15fbec7e30 +size 2906167 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..84f10ba022cf58e506557d16238f6fac23c3f119 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r2_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73d06a34ebb300ccdc81b3d35916358ea50d5ae4b6015a0f37d457eeb2de6e70 +size 3374861 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..ba613de5efc3324c1c9fe5e0786f8d010bea55af --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec633ef1e2bdc4cbc15d8dac77d9875879f4f33adca4292083e621f15d2c1f7e +size 1169158 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21cb98286293cd97160b485ee435909f8aa13748 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbc7dd6bdc78fe539d7938f0fb3a7909cf4a8e5c35e021e68d50a14af4969c27 +size 1698054 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0c3ae8d02e7f0c2bcbc1ae48bbe7b6165e4255f4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b73fb0e0df87a6c668efee738fce702df47b2da159700ba9ec382074971c4feb +size 2216978 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f780d2611e5ae088291b846af717a5b93bfc9e8b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f15eb62ba34e5b559069e1d4043ad80fbc14c97d4b0c9f3a1253826ec5989e3 +size 2730319 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..722f7296cb222d5ce9dc479df57cdd57b869210d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ed39dc16e9e6e740e1422f8357d4d14bcea9e611b0318bb6d5eabe6a3d0f23 +size 3247800 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4c391d1abd6e8be135340c454b67dac0988f1d5 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98722b748878681bac6efac326165e1dae2c24155f9dc78d0d78815dbc1dd7ab +size 7552630 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a547bf38677274d62d74173d99d4fc5d2031abb5 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e4e4bdd5f0c1f6769bf3f99e8cea228023c47c8822bc923d617735e9cee9402 +size 1421159 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_1.jsonl 
b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe3c0c77f0fd332fb2b9aebbc2f2949d79b9d340 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8436189e45f167b68f379e1b616cda810a1807bc1f8dbcf03908bfc6beea0535 +size 2061859 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4cf16bfc37bae50f168dc637e9f7873d231e85da --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b339cdba33c464994f9b42246e515327feba93acf04378ba21280158f0119fab +size 2689087 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..878f073a4f481f152a2676e06448e99ff9136e53 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ccff1f7b03c3ba9465132caf6263718d20d9f8cef3fef43c7b032d4a42120fa +size 3311449 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6be5adf8b526902df9ce3e2cd5316defce94a408 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5b77d36ab14a32df865e69df465e77cee2fac7512e4246f221d6c2bf8826ec6 +size 3938259 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02327c84d5874903d07a0ec68b69a48a32f27e70 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f07694d531d528c2d7e6953bac09efe5bac4f8cc4784ccd7bbe95285d07e71d +size 9152564 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fac251ec125e0a39d717a3afb3024c8898a368d6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6339d683a65003c6f01807433a014fda7a3b1cf8d2959615720a6afbf0cb4818 +size 1186854 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5907ebbbbacec7845f2d1c100c6e634df78d0a52 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:df0b45ff0d80f08376f1b54c7d7510caa8528ca77df6f3596b2137db2040e778 +size 1730743 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00fa181c7ce9af40bdbd9ef621c28b270407fb03 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45efcc92ab990b9804327550285bd8ac6c7a767ac201fe8e48d1216f3538b802 +size 2264461 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eff32eb7140dc6d725e7c01c530a1c6d3518b7a0 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8e28112b3612cb94e59b1cd170e297df4d2ba86b0d17364e55bfeeabc8ccb08 +size 2792087 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..898ab7ccc14d5357b193f169cbd7cc0ef8a5ab29 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf57229f2923e203df884eb057be553c7fd79795711e192f3c3e37b191029bef +size 3324029 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ca1086b1fa60f00a4ed2a897f90594b8bdde5a1 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49f7e3e4319c66153960cf05f6443777e40fc558cb7eb041d35259fcf4676de3 +size 7733912 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2de0e4e6a381755f1523599a3f276fae4cf02b22 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:891d73e60b3a77b2ae5b5adf1e50d08cc931df32c6df04abdc1cc63435ccd756 +size 1369789 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..147054955c9d1729f8e154482468df0b7793f788 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:675b746d78515c7c7d26d5a61a3112cb97f1a20c4533700edb55e83f0b8621c3 +size 1958897 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..8ab2adad5375534d23931345649d2a27219994d7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:598f4d81ca8a8007091c0c044706c256c4dfaa7cb1c6a0e5ec51e041f212f8af +size 2537196 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dab733c9707ea7454634117e20cb9b997471ef6e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46336801de8580dd8882abeae206f26451f5006abb54afec0f6f841f78534dff +size 3110085 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4fd75f32cfacff082497891805a6ea228cb6a472 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:247893164e61b4ecc7207927d95afa471fa4d3e9c88d7d93a7943b688a4ee1a2 +size 3687318 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d70729f801ddb2dfc27fbe80ce630e83d2107078 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c41f3fe52194e388e23cd2fcae4346c4b40345b0880a5e3b5ba009b1942bb80c +size 8551118 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3be4983c8013f0f074fa7ad61fb4dc8162832a18 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:296de5e508c4889a2e9c767333cd59e7204d313888091860c6b323bc4ad0416e +size 1209708 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..39933c2636ac39ed7b9cdb1c5a6051e44e115f74 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3f7d1d6d6548fc5354424fbd538b705ba9ce9f7bf9a237569f921608baf3021 +size 1760743 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79882c723d5b2a6f0c394a43875520a20cb7fa5f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d98deef3beb045c4125eefaf8e0212a2a8f6eb5f2728c183e58ca5256556573f +size 2301392 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fce04491f53e410ff976ef2c922b1ea2aef54cd5 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b94cd0c069a92058585691836aeed389da2774a18ed69cf257cb378f1ede60d5 +size 2836131 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c7ed42acd0ff1af0491539dc807708f80e46395 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f543e8c830b3bc11a27e697aebd47188f37064992ce9a1f9c437c052a36b2523 +size 3375290 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b5926a3d00c43e95df1c4a2c3096ec47a1ac732c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_anli_r3_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ff50e2170939eb0473a340f108cc0b543cafec8a7c134c2b9c82d7d91fedb4d +size 7850716 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b6067a16e97c1ca843797c19689dbdefd6dabf9c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84c97859cafdc69fd544fad7cb30422acc0b842357cef6f1ca7d05ed230b81bc +size 1216777 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..138c603296aa6eadaf054861d69a4a9a97ec274d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6759d954881f59993712ba0e81ffb5ecc945b10442b4b031cac55b848b2e33c4 +size 1670632 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..11124c59c4f88f5923062b0623e0fd2b2b196144 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b961e32f9134df4c446be18b3a66d357d8dc3bb4a005809c2ebe22fc3808d3b3 +size 2120084 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_3.jsonl new file mode 
100644 index 0000000000000000000000000000000000000000..9d49efef4bcbc9af8d801d7ddce8ed10fd1c2bb9 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa235e8d9b5b3da01167d128e8b4e88482a60e250577c57e38dd682eef7d400e +size 2577615 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad25d4ba1c6e2a1e3960cd05188e3467ecf0e05e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ab269cfd74dac2e8b890f448975c0440488d211ae52ad2d0d0f7b104787849f +size 3027517 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8958d48c07782b41a0a8b7f6a936f9144831802b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d07c692ba2106c6e5c476f44bc7b412d23c845b82909125777be8e308aedee54 +size 3479076 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2937b97112b11732d3f58dfef11d1546926f2a72 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e99f15226cd987c3b31be142bd1ce55356963dffa6bbedcbce5351858538102 +size 1458507 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3687d9a44cf3fbdd0b1c0b2f432fae34a3c30d81 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70d28f62cf14d37b423c034efe339b902132ecb61b0a34e1eab704d0bd82b469 +size 1961266 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2574606f6c05e168ff58f3b8bbe82c5a561652ed --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b36698fe58a0a5d705c7b7fe745198130eb5bb0559fd3e28ed9649c5dcd0f0a0 +size 2458087 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..af91fe949c7a6eb3433070ee197da3eca3c6a5ff --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f53dca58f7f88300d45e254326ccb318da6cab4f617ab13a1237433fac18eca7 +size 2963892 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..52198cc37fdce35762a13b1fa30ee4d9baa6963c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2850849f28c190ca40f642e35a795cf32fddf4862821f001f5626b0bde111c1 +size 3461928 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d547efbe6265be6754e7b9268ae931ee8b88cbe8 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99f252466a2bbdbeab60dfe5dff6f113fcfec66c59f3a865e78b81acccef7074 +size 3960893 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db62ba3ff404fadcc33bb73b110fb61c357122b0 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25504957404f34761020354e077fac114544b344ead51746ce8a9949442f8aed +size 1505923 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5abf42a051b63970bd1f6626c61158dd630d810 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e40c018532d91710a740e85d22ce6b0a97703a75330d58b576a928fdbd44c91 +size 2032502 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5bd582daeea684c548030f6a0f21101698dc39bc --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2862d80322ab7366463a9a62a99711730b90fe461c22efbc7e09f442a4558c50 +size 2553773 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8bbca733611b428a1db4ac71b28c61d63e6e403d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:160c06b9e9c868706843ba778b4d0db9c323b4d5c636709967be2f17346b6251 +size 3084257 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..ba1eb5e696cfce050e1afbe1afb9d6b14941da19 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca66cdd91404a8ecd6d32ee291be1e643623e27254c689f257f7f7725931b92b +size 3606661 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..30df0791953db161cb9f0600b608283fc9267c24 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d94ed6e3eae851290e75ae30bdb19d8528a6e4134f959664064b03a878b732c7 +size 4130458 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5aa4a7580ee03d8d5ab3c8cb6323b6599184e7f5 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a1c70bc99847f7130a16a4f60be905ab8e730416d37cf052d94de0753622972 +size 1202714 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7eac849c788aa8ee9c6e40b9e792c4c905bbcb45 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f1695a0df635c6d0b47487c62b248962637c5e82c3c9ae1e755a351897b6d9e +size 1638992 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b98a541c65ffe4bea9042a69c7fc54f06682e8be --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b98a75484725e3548a1b2a690f9f6db417f668294368b9ee3add5e41917903c +size 2070864 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9e17a068de8543eed6de048ece4062d42806e0a6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a697280fe632cf4b815a84ac602a2c8ccb38def77a1cec0fa66672371b30bc27 +size 2510815 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..272e1181c6a4f44c48f9d9a524e16d2755ac3dd2 --- /dev/null +++ 
b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35ac2721ac78f562c157e9d49da714aae3094ae2487cf641d28106ae77a4809d +size 2943137 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..484efebace7a595e3a5e587e71040b4962694fb2 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:229374fdf0cab04cbf8d3571fe864e644c79985959216af13f6b44bac78c6ef9 +size 3377116 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab8abd6495716b06494156260612dd2ba4b6bba7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad6b457e7b5c9b82ce5ae28d69e96016e57d733e46332f44fe001c104b42a6d9 +size 1187656 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c028b34dc65639a2e6ac88716b51cf5ba57ddf38 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ba38a7d6ff5fc8b25afeb3249446ab619bdfb3f895b371509a1461892ee65ea +size 1557969 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d8a36e75d3651666f3820ac8159a64a15d29174c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8e5aeb7e4a84f6af6e89398c234124c5cf9d5e261457273100180aecc0199e3 +size 1922367 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b5a7ec0e37a437ed51cd052f846c4b6d4a5e400 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5225d56067068cb330fbe9ad53fc4b05e9b0326631fe093d64c354e03c1dcf81 +size 2295635 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4f48f24113e61213877fd4ad6963b8f372e8191b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be15d6a0bfe113d7d69d2d9037ba44552d1ee7d62e6878a9a167447fdfee2e34 +size 2661181 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_5.jsonl 
b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb0c7503c3a6d32fc2ec88db69e82d8770936d70 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_challenge_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c3c8c967ab5196799d82257b6413f3bfa01c77dca6cd07e1432f378209def16 +size 3027938 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1e035ee925bf74e569f94d5b0f74652ba8ebc0a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fabae93b36c42e4e68a7913ee11bf4015c25d48609f9466e8ef6a9778f91814f +size 2351241 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..86cca934c7464ff8312edbd5abb920dcb17494a8 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa0625261f84895f43aa22dfc2ce773d18b28469455edadca3dcab7de0f95d27 +size 3174164 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b03c6dc8285238b03d99ed34d9cebe94aa019828 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51980d249d2a39bb1ea722353566384a1dacbdb901aa36fcb950dd1e33c79595 +size 8017892 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e2451895abc85209c4d24ab06d6898eac7c925ab --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5f162717d26b94d4103986f25988e1630b6e2e831d38630db4ca82c613db9ed +size 9663224 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da38463b7a03242f862ac75dc83134dfe82a0a8b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44666b5476bf3d141719a72e895fe2345f15a317c7a8171c46e57b59d0887c6c +size 11324712 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..74b3697271a21ac734c30966f713f77de5b62843 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:860339ebb3863d9e7255aafbdf7cf5b282b8bda20e86e604ee7c1f21f46a8177 +size 6494729 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07c4089504e840b5ca7a19dc4750d4b29c3ad557 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5faf83742c2e372df96d659b0078150cfd77722d16ab7b4b37a246d04000e812 +size 2745668 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b618cc11bb228b01d383843c4561b392233215c7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1396633507509943c5d1edee7b9a4a14aac393da285e5fcab8cc3928ad5575b3 +size 3649259 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..55c5f1189e4392035352708f540d6d562b57ad1e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08c8193732922c785322b54979a6afe52dd5ed9baaea879c65e006298138a7cb +size 9133752 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9050019a5ff49a51d5fb50110876664370d00317 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f580f57aa2dca1e1f12ee350f7cee3dada7d74d351fc206ea26d823abe39511 +size 10940616 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3567be3903e84e594ebbcbae542b51c58a3b3a50 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cab08781123e60ee54db0099661e99e4aa17648def5d9becfd05250d6cfdf1a3 +size 12765348 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..88fa9320d54866d71c0534a2a00759a8f30551fe --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c51289404942e5f03dde67691a331bda05aaf6756454e74920b98be021aeb4ea +size 7294828 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ed76b9ff4528c6b93160de0cfbc1bca10dcdd4f --- /dev/null +++ 
b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3230a6c74f6f1d5bc9c539b8f08280b05307b568afba781933901e2f8acc0378 +size 2843406 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d9a3b32110c088acd5a567d60c59d009549fb08 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14782ca66309d35fa7d24381dd602d9a72b3ce3daa70346a54495ef6067e0871 +size 3795077 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1f1cb334edb4f961bc8e920eaa5542913329fdae --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ac7fda1d2da2054994f8c56abc104a7c3706bb4d20489c5be1c1fa83ab6da10 +size 9525080 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e9071f9c69eb1b98ed3d16528ab1a9aa975f1a5 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a956c255fcf6e63b13a0dfd49ba4e156b9d414ce7cc2b58dbd14ad936d46010 +size 11431502 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c19d64c87ad0f6ced4839f396f32c99a6704352 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b341a7bf99398bddc8eca29382df5111ab77e134af36e3d1ac45ee42ec7b4aa +size 13355834 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d461e2910b0471b4dd9bfa5d5b3a0c157df90ae0 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54fd4bf21fa126c174548cc655ddae65ba6f695812f3bca91a53978541d88be6 +size 7640330 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..99aa2ccc5f290723c9ab3558c1d1af49dd04a3ab --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd17487a04941da44f3ef19413b2415b8e6586c9702cb872e905b0b191ce590c +size 2322732 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_1.jsonl 
b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1e84044c2200ddc503d3c364291b66f9f3065b9 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b78906e5683ca4f5266c08a18825643bf930fa9ee29aedb40b2452692948c1f +size 3110021 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5cd0b116e4b349f280fc083351f3647f530628d3 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3703eed80ac57af3efd72bfbf5c67fdfe2ec4139ed5b1014cb33cc78467f7bc6 +size 7818320 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..731badef1e5fe6f42dcc2bd205035be945cd5047 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65ab4af0dfa52ac603da9c137925d089fb4bd7fea5ede246d2dfc20415d7fb4e +size 9392368 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c56626fdc41811edc3e95dc9c849480d9cefe7c1 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bfd5661e856d209d06a3393cc144065c9b28a7596662f9c320854c770fbf81c +size 10982580 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..efef4c9c874f18e358f4d4a22bb078b00ecade06 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8ac80508956c2efa83538d93a56a5e0d9729b0bf9479f949217fa00eefe3141 +size 6288023 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a43c1f98d51d46d1726b36c1485ec8999e3d6cae --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e97fdebcfb91c13ee04c009dafdc81152da75329f0144fcaf8d91bf74ecbdcb +size 2196775 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8e6c02b38d10180e69667642be43f9f1e412887a --- /dev/null +++ 
b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b2b4e194ed58b8e445fdf86dbd922cb24592adfbec085cef77d71a86376cb66 +size 2831915 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a02cbdce4594f2d4dde4744a9462e07fd64817cf --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2390d9cebc40d171819066ddf082b1b030860c941be9f52d3ab8a93ebfd6b2a8 +size 6961630 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f37982ec65ff3e5c5862f62719ba3b76ad7500be --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3cdfffba210316f77a6dd47c16f028f9e20de079b14559cf8fcb6efd7495730 +size 4115778 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..27e2dac24af68c0e498c98ebfd9f36afb0499801 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:444c99e92b7d0f9acb8442203c268dcf2108a2e43788028698b15846e09c00b3 +size 4759792 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d4cdb4172bc278772b4ec920d2c1315a152567d3 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_arc_easy_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44923ed29c5cbdb16e77811ce6a18a65f94c3fc3382ac82627909dc8d3f83e2a +size 5403553 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..be48479b8e42bc7ec96f1799067bbeb9cbdd256d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b28e59d1a8fd1bc0d3ceab1dd2faa08d8a6e1f3012fef9209479243055ab5089 +size 3642541 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3ed8baba2f57fea852f967fadd667d04018c62c7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:774842dad264b4663fe77d5c4c484abc9ff898ad859eabdc5467b06cf13f5467 +size 5657602 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a188b352df9f3e35fa7d077b03a6dab7ea5a3dc --- 
/dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:247c6bffa6aaed059e544bbac0e5fa2f9ca100941c409375939fbb5ee3860ec4 +size 30777192 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..64945cab40a3af7861bd7aa3336497fa9598de3a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c2fd2c54dadd1f36fb9cdb66700cef049a3dca6ecc43423d6ddfe9cc74cf3d2 +size 19456492 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e32666255a762692bb2b5c6548599fa28d56925 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c291318be694598962e000a0b1176ffd3e53a69802df838fb507a865595b706 +size 23537572 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..93957cb19a7af490f8321ec00be7da49f395cf74 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_GPT-3-Style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:391384919b1be6972ce7561a256952e757447337754a9c053a6e6587e3bf4a02 +size 13790274 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d40d051166b65843011f77e78cea92e20cfe76a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5f8e0d0e6e9a3be2111286d7a228184e24c8a3a8f542163f184bd448f771112 +size 3984676 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d5c4f1fa7e8d38b4fda833dd8c7b32118097d624 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beae92702544cee3e93f7dac5064e82e8d196f47d79e301216611aab8f8d1397 +size 6167834 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3306e05e41bb09ae24d59ccf81004980112f40f9 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59faecfb814fe71485af5434eea23fe0674f0e69a5413164c50ec5d9934cd2e0 +size 33485616 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..7b91fda59f11279ea93a98444cae0867b07c5675 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe49bec3a4228c84cdff0bd2d4076365152354850b412d5b90d57a60aca83f6f +size 21145390 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..06219fa8f9c1a572db0d67e1cc3b49b332b94d95 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8472729a2329022c17144e814afb91511d12e295d07cbc3bf789d51f5a4f72a +size 25561064 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..46fd2d3a5b34f870e18d40bff33bd1362567ee5a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_after_reading_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c99671bb5962d91aaa218e063f3b40aeaeca1dcb794454ef8e80063d6adbae40 +size 14969340 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d40d207590fe585e21fa97aee4bb270de38833f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60753d451d8b4c4fbee9fecf9a14420c3d6d35449c9c791d1b72de95f2ca152c +size 4041656 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3ea377162d912b74a463cf4771aa6a13ec273349 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b7fcd7a9e7f5e4b3b6f06eff7772cf7dcbe70010c7e335642625cd93a57db43 +size 6260693 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..af5519f1ce5077c78ac07991db51bf433e2ac225 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fbef1802054b5bbb2c209da4fe9cb0bad338900ea7dd65fe85f3b4b9697eb8f +size 34003668 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1331d3e459d60544f603e660c471ef0e66fe8317 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1edb48193ee34b37689305e0ab12b9ea84d2c81c7ecde0af538263b372643cb6 +size 21476382 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..cee5f0e8531da11bf6b392751f118c178c0ba70b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b1a3b259fba586e2a1101cbd9654f229b259b4a20385837e91c9d5ab82ea021 +size 25964216 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..80fcd1316d4558566bffd642fb32857386d31d09 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_exercise_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72e14c3eb141ca996fe5b2f5d12593e7069f9d0c5a178a1e6d2f209187f88d1a +size 15206956 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd772e19aae562d1a970515502b8f25ca59b1e00 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59f9a5daea31239fd3342ee713cf36beed29a90ea00d91aec9cb48e9e5e2a1ed +size 3663818 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2e50e2bf3504834910a80e37281ab04acf8c13b4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d4c355db4fcd526d5c3c29fb972f910ea9d9864cb920b1e1709f4b93ad0bf11 +size 5687195 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3ba6ca2ec0b031103e90b08a1577e5df958597e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a70ef792e55a3de666f10966a721ca497c13f26b34c4f0e418c4c2418ab3c3f9 +size 15464986 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..52047bad9cdf341a2ac465610a5ce66a130508c7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3333f6c234616798b89915db208730d112f001bc2c250f9ad8b6aeb063e39af0 +size 19549606 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97f47afc70f447026aa4df30a43ca40ed3c89c3d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e987aa16a73815b63cdd886ee878d66b7acb15b4e4510f33ba201ce789b0189 +size 23647230 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..cf098edb58515421d99d04ebc14d3e264dc0f829 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_valid_binary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea8eef18cdf504a058735c006c802780ea69bea74ab4786b0f92c4126990d062 +size 13853402 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..264fa1c6a70a27a8e88bbed9435921c586cb024f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0579961ceea06ce341dc35166ab86660a7627d49f5c76f3fe6ba68695e9dac5 +size 3861959 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..74fa9c44ccbf0541154ee49b2f53aeaa53718a96 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:625a7e5ace446c73de0da575ebba10a4ac043281d1a2412d01966d449a9604a2 +size 5990495 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..47c0f9f95d9fcf4a4a11b1ead76e1fbbdce1503e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4da4e8287b9a55c1e79eda7ac98540c81ae8a5d234788982657f9ba70286f99d +size 32568172 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d2a518a1c649327240cef8a24e3f55657f3bdc71 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1716e4d09bdd935d482177d1e921e38502eb3a41e95305185d4666b407775ce +size 20580584 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..823ae5d11a42992b4a8e88fe3fca40cc83ae1d79 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1811c62885cbe8b19a118b19a1555bac37da92736de4517f2ad38af2630a8b9f +size 24890046 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d18565985ea7f55d65e66b29120384fc223f963b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_boolq_yes_no_question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e71ff6db7655041662341c2c5c46b10cee22dcede8e736084c30bea8e3e6d0d +size 14580588 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_0.jsonl 
b/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d96a99663a32fc932eaeeccacc7462560e2e5877 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d6126480f6c7250b854a223be1446b3fbc839e24f5a67e46dea97a3fc38418 +size 55148 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..30d48278bf67e5431480659628073264d0e676ed --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee178f2b58b7846f3f437c2cb6bf063003cf22e83b314abf1624f067ca643855 +size 77957 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..64d0235ee2c44912a26acad15f725699728c5c5d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3533b8ca3beed33f9ba36ec29b6bcdf6d3274490de80b4dcc949b5b4dc05ac93 +size 99569 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6be84150454247ef02c353084a67abee0e4cfe10 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b578a97696b53d8b84f3c0684565e937b715f0764de9e38d0423d22e5c84f6d +size 120728 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6daa036ccf6295fe27e56319fe47f37cf80844d1 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeb32e50287ea5be095ddb2450c7a466e594d6b5b49a30d75ff72e4572b43f0e +size 142853 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b7e251079e1354a79884b1656b911c31aea2bb0 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b08878787007a9be56836102a05a521839091755aba0cfe74e5c4cf9d3a6b13b +size 163674 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..806b11a9c8b43716ddc929108e3243bee2b45fa9 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b09d06d81d146b2e5697d769fd76799731df2c2d70081347d8168f300891d6b8 +size 66218 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_1.jsonl 
b/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05e727f552de7c8f5595e084f9c4ab9b04849c03 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc9c0226b8c01919ce546dc31266c1d747a99ddf5a4fc3b08918c855832e043c +size 94141 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..92b4f563762469ca5dbc09964f7c904a9f21108e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a9bb5592942ff1c4154e5b414fc4d5cccd2d5dd4abb874961fd7f8efc0cdfae +size 120817 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..243f0f849d7d4b8120c107542f550f5cca6a3e21 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:604ce9427e2cb068f0cd696c62151652d2d907884f4eb4c638f46741e6394f01 +size 147076 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bfbedd70d97d1997cf424caa911b108cf78d554f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7215f4ecdbd281feb3807bc9dc9029c99113b0f46dbcef52dbc82ce7c5d799af +size 174286 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fd8c9f707789b08cc23e523117659efae0d46942 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:effcf1140a6053c2229c0166f53b6b91b5b190db8342ca4ab050c3022495e38f +size 200199 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1adf241b8804ae67ae2e29a1335956269f42008f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dec2d07593e9dd04b809f78bf251928189b67ae4887fbd7e6c7fedb0b379fd5 +size 56302 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71d915e82080201bc20cc8f3431ee44bcad403e6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2968da46200dbb73776d530f2887bb97caee5c1a661def8839cf56ace0044d +size 79780 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_2.jsonl 
b/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d6dc99bbab64f5bd339b086a7c5bfeffb07df4d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58b3e42f00e633f8460b34068fb3bef1f10764eb8133d37192538b2244b06655 +size 102072 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6c8aad2ae0550b14f4af694475c842bc0eeb2a2e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0733b873f72e55b0f9a9166793193d80f7319f7606047ed54bebec7a178c8914 +size 123885 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5dda15be96bbc0d0bbf5034740f15eae7a68b6dc --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cc47825fc9a2d14cb5c846b29793d3c86e37e1ef28356dc744f575b5e35631f +size 146666 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4bb5c5c6c7086aa3e4c88cd2719ab7d9ea49cc71 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ac3887b6d9070539056b88b790b8ba65d12d8a5059c59aa213f9b600cc38ff7 +size 168149 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f2fa60b5eccb22db7a9d4a3349ac0a736df94a44 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3f7d261ed59156f53dbe468eb6af80f07fae281d98b002c20c4737ace7887cd +size 63935 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe22e0a1d467093a1f297f5ebe4e8051f28db022 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e441c7cff54466585b25de3f638b2e845c17a6526171598ddeb91548ca56a63 +size 89584 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78030e302e3a9c5d68f556a9af20ce128927555a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b4e7c3c0a800efe17ff92afa03ff0d9e7f3284c7b9c4bb00792ea4fa0bb60808 +size 114099 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7f6a944b55964de7516202440c55e149ac81d4b3 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65e58ab2c0a4ef431dc77739da10fe164401b1355a15101d0f163443f642f92e +size 138108 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..28c1819f24d14aed568a63393a2ed61b3baa7477 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e707a0ee304e123099707f36bc55c5528b4c430e8ec37412ac7c44c8e2a85cf +size 163116 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cde6414ee88db6c13af85414a7ae1a68e6f7f8c6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65bf669dab10f1ee8329ebdc29526ca0ce265d9c07275455bd960f50e868bb85 +size 186800 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..845353fac7aa8642b78aebe25f8240155b3b3017 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d982f3d51ba07bafe3790ed771b4b7faa809238d52cd5f0733532f0e524b132 +size 57300 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..191930176fe74cb5fbc2742e8106e1eb995f30c2 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcf95aef3bf1fe5be83db59d096ca2b25b881709786632595eaf5c43d473e05 +size 81124 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a9e7b50946f8cc6524789ee50d9af5d17e552783 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f32341c4063faa5a03f14c72a5aa8aa1f800c1dcabe8d0f7ecdeff831030f02c +size 103746 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..6c78fdf335daaeefc0fb8a2ca2929223d31642a0 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:159c8694a9e11e42279695bdc7a019fdfd25bea89a4685428949c6da9b436eeb +size 125891 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ea64fde25a6b0731c90599109c2601141987ca57 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ec8becd20155e81ac277d9a02c8b008e5910f6b4dc06d006df9283839bf0f46 +size 149018 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6084c5f89b21eb820ae97a0e6483311dab2e0b78 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_cb_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:094e348994448d060ef3c323d8f3c1e8496806af5aab1ca7aa1030ab54da6336 +size 170831 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed580aba2382dfeaaa79edc5daf4f1d76c913821 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:527e56ed6a6be04fbd93b63ef73be9b968f2ba5e283e880c82e527c916ecbf61 +size 92242 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02fa43846e99ebc42a588062cd1269539bd75745 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70a7ba98d401f1269413be70d26e704255d2a2b4a82afbbc5bfd95ff9e01aab4 +size 111565 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4afd1a51d4b77664b2b50f22787da13d52abfcc2 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:174ae0be7faacf8f28fd711b2854d6bf0d1ccc9512f47c7184aa9ecf2b70a68e +size 132086 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e3e67552b66bb0e4d97060b69be72210d75aa175 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80765efd7e156e50d8ce4e133bffb6dfb056e287456222f840856dc2e7591147 +size 152252 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..c2ce0b844dbab0d779f2ab792067dbfbded43437 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2d2406e26c68cb8ea2c62a833b36785d220574e938e8ac01d63182bb3bb924b +size 172072 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7c7f97076560ebeb398bed634125e95c20c2a5a7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_best_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bac11f395f0797d1c3103dca413938ad7623f2f7f2fbd1efe114089444328dd6 +size 191995 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a7484782d573fbc897bdfe88d2ce004e4c29f73 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3f6368b0a8ca36baaf00b2ea34980466b9c2d5fad717d75978620d1bf58f26d +size 87884 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..abcd2708ca050e92efa33eafa96ceaf607127a8f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b67a0311d827635492accd29034a48cbea7551e5817ed8db12ab47cf1834a6 +size 105071 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab553d2b1220d788ae0e03b62962fafe627024a9 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a45571144e945d3729e5f84853be16039d554e49775e9b1da07392d1e8842cb +size 123467 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..57b704ad105edff7e525ff23a3702b304edba967 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:501a057e3a980cbd1a0af3a74d2b4b4db9a9c6f0b62621b9a24084efd9c891f2 +size 141457 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..015e5c59b314338bc5d0a8b5a7ee1359782eb80e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a235dd32c73242092d022f5d335ed328e873003a97b529d25c05b2c0cb50ccc1 +size 159126 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..6cb6a31a589b7ac034c76d6a5a2f640f49e02df4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_cause_effect_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d14289386fceefd510ea25dd0fd2c12d03c459668f58a94822565d6b8b8bcc8 +size 176944 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..911a6cb33e56ff08f1e0b28c5d9f8df4025fbfca --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd9273d9ee3f70ac224f55f890bd49df0a3c7452405b3310b13429d1d8c18e95 +size 85179 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..616b21f790ba18e46d5a9fb5470ba315c843a5ae --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ea0b682929e71a77b51072bdadbde2381bfe772e3768ff25284e9345150c087 +size 101268 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1770813022f788d7caeb2122025d465284a13df4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b789bd2b3d728899500f054d05ea539239d87672c32f0ea2abc3f935fecd394a +size 118521 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2656fb83f9a58a18070468bb1233ee60ef662d1c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80fa8022bf8ba3b2c206fd687d49a388b4a11c56d85d2718bc5a7aa2705241d9 +size 135431 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d9de7bdc21cc9996571c3147b432969c80e1be15 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a41d602d9d062dd1afd754545d771e5c4888c88732005f0cb1579c6b47ac7056 +size 151973 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f7fc230c76a411ce5a5e620d917eb94912cb1672 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_choose_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c622a96c8cb453b4a830d7002da7de50cc45f1c78f3081b1a7c9ca99bb3bf0b1 +size 168768 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6161657f263a248c7214da53e9eb450e5782da7 --- /dev/null +++ 
b/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47c8475cad09f3a5a457d2abf05c8374bf4c8d28b0f9c1351f68ad6a59db6216 +size 96765 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d96889bea1dafb3e7396f9d801878dbba75fcc11 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f025d23b5bb29010dbfbe495efa17855fb2a26f7d4cfeb45e2bd82b767b90972 +size 118265 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8efc6dc9b8791160aa3dea92a03e92b4f9030ea --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25ab2abb2fc1f69bab723880eae5ad501b6c1774c59cae2ce4200ea25f7e0f69 +size 140967 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0c22bae9e62569a2345c7afc9b1a2877259146b2 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:786d62771db860d6091a9ea8301a1196f8765af40ea446cc348ab8979b81faf9 +size 163261 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14cd2440472718efdf2c69614ca9b9dd42be952b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b490ec3e23a0364a133bbfc38e48741ee8e5055caafcc1f136d455bf7d9f7a8 +size 185202 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..950cf298f95c947c62c14e1f03c8d0a5c208ce04 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e68770909e2bb9031ae45efff922530ab9f2fa0b45ff6381fd41decc34ec797 +size 207315 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9483060da5b04805b368231626d64e9545ada99d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:417f0109e8c4924f24b66d8529dd0d34188bf5c5b0f5685385118b266b7f142d +size 95775 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..8046a0a39f2bd148df5eb76e6ec1bace88873d10 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0e1e646335ce83962d92b9f1bc81d76f5c111e345ede0253d21c8d9b4768f5d +size 115642 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..99457750110c0e4014205fa7cdcb863b3015c3a4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee6709c90f102d9b64d9cb18b7887c1904ee51ae044cb789adec2d238c59158 +size 136715 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4432da32d782dfc60d7c01fb7d2e529dcd7c0ea7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58191ce6c30dc14c89007fbee15a62f27561db3528c9118d7a4c55daa6dbe26e +size 157434 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7af0b2b1d993ab98ec816608cf8e02e00187ed24 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3552af08c021cf7e85d4300a48f188ca0930a7bac24543f24592ccf5aa09f493 +size 177773 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e5a590665d7f5978353a3d1dce4adf05542c6c6c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_copa_plausible_alternatives_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aae235a54eb9de4db4a4b6e965fa67b5c5ccab8809fac507c024f574e1a7cdf +size 198360 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c711b7e298f4602c74a72e4d0d8ea6458f31d990 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:940a738c43866f8fdf79b3156a035df89c1655601704a736c6c29b26e7f6c500 +size 3481115 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19ac5f67d55217ba8a5d4287299b286a4b336773 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35f23a424978afbf55f8293eda9a4044f490cdbd7fb5029bfd4ab417f68a9ffa +size 
3922207 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..69d38d88a98d587b6f2ddf652c3e2c9b5eae3911 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7299274b88b4e432b944b19e6a5ce5f9ace26195c709541bce14f0723725b24e +size 9790008 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26fd9c8163910b3494946c328ab3d03bd529cca5 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d056b033fc1224af74362b1cb487adddeb7dee35a6b13a04f3457fcfb47be1ca +size 11666628 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45a3bd0f33aed42eb31cc9d2cd4254766f873357 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6e2a788720ec34a2f0f004264c8d5ea4c58a2d1b66b6c9f41c3d673d09af1ce +size 13534022 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7c282c54f073a437383be5a262eb8b4801a24066 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_coherent_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9990a2c90d87564b152ae14674b06c9c546d17faa3cee47c018488eac9aab815 +size 7708055 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d86cfd2bb1d808af7cfcccb69fd8850af6dc3fc0 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60fe7266fc7ac25335c72a73cf813e8d68236936354c0db0c128ca13c53a1a66 +size 3398735 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37eb8b7215fac5f6738c3d17ee505efa219819a6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67d737e0b381fad6ad38b99eb2b13ef3b01976e194deedd1a8b9673e582edc0a +size 3861381 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..e5c39fdd015bfa7a62fe37c2de09b8d6abbe6d62 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15136968e9c526e2d1bbda911b85b7e0185c498b78b3cbe7d3f1b4a0199d67f0 +size 9592846 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ab7f95668f7ad526f47b2f5c082e426c63eb9f1 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bed2adad669a86444992041cb6e80e632930450e374154ee35b318da33d5bf3 +size 11422898 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d883adc4262e92dcc5a3d4412cb96478cbc9c550 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89a9faa368a24e19d795f054a20cac8b2f42ab6cc1d3bf7fb4d491d2eaac185f +size 13233340 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2fb43c118b5841685468d99f9b90ece45084b2a0 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e357367e75c03d4a46833868f6a90cccf43d5a799ffb4083cae21d79b53981a +size 7525029 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7192925097675e9c1c12331ca4bc7cbc0dcf2a36 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004f3c8cae02582650b439227a8a457578354e4f28f5999b746f37a9416d5717 +size 3756904 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..28eb733369d72c229cc40719a02bfb309486093c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26d45be3fd18b5d91c4a46a5fc0dc15233fb147a0a08ec82c3e64252e3feb954 +size 5036343 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e61019a7e7d381330e4ea721a8624df736482fa3 
--- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50e4fdd43dce2a83cc917277bd252dc97c82cbe9387974e6116267a62a3ca8e5 +size 12089518 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bdcfc5f60024d6a0a45d9e8dced8629b3cba835d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31b81c1dab83e3b6903e30f9a6bb9d875149cda1ce7b8dbdfef12cc978f2d06e +size 14149706 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e2cc3e12c36dd09c41b24e3ada73bf8dac28880f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfc2635e0a3bd68d9de7575bdacb9dd6cbfe9feda9aca8f5b4f9ae8eefbec701 +size 16239062 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..51dd24431cc3574e5e44cfb5c2e871ed454a12b4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:942f603f84fd600e4d7b12435fa9df2882b5a1e69b09de835f5d180b0d245061 +size 9168500 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..49a1b458ea6b142c67a4dc78c3c1b3dce5ada032 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2d3b538aa510328347bdd29b64ac893905532fc2f914a0ad7e1d3094e28de1e +size 4328370 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..103f7b3bc4b0d6e037e5a699013ff8c841744a66 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44fffba37dc501dc26bd8b3be1b9a324ce6cef050b6b7759db970989554a5361 +size 5012692 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21da41ba0e312ff3921bf9a4fd2683cc59f844d2 
--- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a670fe17f387c0b1c2f6a2d5137a00dee20f56b64c3321df6b78ed3480d2804 +size 12214960 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a4a0c44fd8d2e6937f6a6a0337b6d661ea1d2a46 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f466a7b68cdbcbe56b3067f53747c7f86425a1414ab42eaaea890737cd23d21e +size 14385264 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..84be46a1e4cb1ba2600db4a03d0a1026dc1715ea --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc90ecedc55073c3809560908e327ac7a3feaa96d40125c04b96c5cb07aeb197 +size 16547866 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3cda655bdd35b700312a2e08b4f6e2474375a34c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73321addc1f282647c064335727d4ec7c99d8d747b343110175085717959ca72 +size 9362925 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db097704e83b63148deaa9cbcb5a7ed2aa31fe11 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46f4d754167f628a1faeeeb1f653ffe915252965b94b0503746a833c1506256f +size 3278365 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab31f15fe770fbbf08496fde24eb686da6836201 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9881460e16de14233a377fa36455cf8aa91a3dcb87122563c23b9a1b6cefd72 +size 3457616 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..deb7304799afd043371c7479ea6937e13352941b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8771e129fee519b6da9df4d1c51ef7e6df9c9f485396bfc99fcf422f1c740159 +size 8503616 diff --git 
a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7cbfa1ad590798037b35d1b0159658b48858710f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f32f4dc2dd14b1551f18ee061cef6d6b0e8d6d2b1a590169fd0f1f73c2e65142 +size 10081046 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6b068ec3ca446ae8550fad22ff14a16d17ef2b6d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6060d7c4a85764f307fb37da77840306f3d1bcc6ac57a90543716af22e2a9115 +size 11645964 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13133a3a059f56c04feb188f336adcf3f7dd3cb6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_e2e_nlg_cleaned_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:013ae3359d48f6ede2e67c890ffe9aa6ea170d26d464e0096ff1b17e91542c2b +size 6612764 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b719ec7f013ac86381b3616780fa8cf6a8c10fcf --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0aa1d88cd9eff5f209859c3bba6733f0f2d08b24c397e6d9a75b691d62c66d6 +size 2889082 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc7eafed317ab99daa7906833973bc3612022f5a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acd97f9939d7c291c6e453e20b0ca0760d1b579f91939e8a38f98c7306b38e03 +size 5157350 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..102d805f52b9dfa41c569b782726ff4c4dc3862b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfa198c068d469c714f3581eacb418ac4f7b5cea9dc1b4174f2645bbd0c3f42f +size 14793006 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..781ae557f768753c2ee740716c8b2bdc319fbee5 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a184538c97e9fc242dbcd330983f2f2759d72fddea2897c46d2efa44e945ada +size 19360534 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..98e62df26aec83cd47fb3d350624b24eb5bd3e73 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a7c9d83cede257c2ff666a4a7e993b62ff24a6667b777ab3145249e16144383 +size 23607592 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4e6942a72c1cb82b4e14756f1797d96312800f1 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4846dbb5afe8f3f3b7b85a632b6e47c5b5d7b6b463e7ab8cd860aa326ccbf26f +size 14076796 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2ebaf3c6f4c84e33f1a8dcb4100bc23efded5447 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5611bcbe9031de24d42d7ae2b334477517d3a62bced981291169608b4029206 +size 2785126 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6f147014b73a4658b39668d14e3c0eb9866f73b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:352743341edc89f6ad343a1a6f7649d896bd9291e659153f76abc2bfdc5a144f +size 4954314 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..09a5e8e4b5535694be73e24bf3b964b8ba2e63c4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d743ef6641937eed580b82de36a7eda1516d126c745dfae9f96721c7238a9759 +size 14323888 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d93cd4aa3cb16356c99a1a9abc1d14479ca74a4d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb52b37f266f74d75be6c7ae3cb1f97a73d64f1ff703414dfc4971bb7f4eafcd +size 18814440 diff 
--git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b7101ab054a52ddfe348358fad44f91986477179 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d42c7399190b1776227dcc8e0c2a86972352796c7920c48f7e9a88230eaf798 +size 23087468 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..df7b0b272c46265319da3d274e4e24fe2a09458b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_DOC_tldr_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78df2e58db6d1549ea21e1b397c5405fbff91180c4718140198dc4d45a6eeb63 +size 13794742 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eb8d61e170aa762a92e7c18391a2ef9bb3a4b8d8 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9284abb2afd3603738d9cd727b92c0493ead5ee08a6437124f39bf93b59023c9 +size 2825506 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b2f9f7a23261f399bf41278a05818e76ed4f4bcd --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff3c0d5e83833c806f391e9e5b69a526712a81bccd011f1d4a71ac3fe6d3e2c8 +size 5018596 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6dd4ed187f7803db8ed9b79a99d6eef334c25e6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41f80db2e90385bdd9845666e0e3789815b1148c49fb538549e95a9d734bc2f5 +size 14518266 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe15c882130c36dad7ca0444b0dc4700e2b0a506 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26c150fa4645a3a30a866f72c5299fed5d58669aa995b61b69067f0b1d17a527 +size 19040686 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d1cdf0baca36b44542cee6d48b811868c81441ca --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_4.jsonl 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ac3380bc1c5451e213c75551065a24f5b70967cb3dc38f1bf115d6cae3b4a6 +size 23281808 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..433a4d96e64415251a89709d60b5aefe2ee33526 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b32c0c4ee8031786b729461329fe3bd3bd728d631707ae7d2c560f78399cd662 +size 13897447 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4fc4bbb04342c11f7e3a5013f66fecade3cc9129 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77ac9143c7e416e84c2fe9f36f16719acb2be309c5beb23c03885acd6613c25c +size 2821893 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f696c93a1012ee4b811eb1f182edb2e2a4462541 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d871aeece6e1130fd07861c912a99566aa80ab5a287b7158fcad904a776e99a1 +size 5055475 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c7d4438ff1b69d0e5ee9dc5ea661b22dc9f2106c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9a023ea4c4cd5db6f2a5b5a92ebe217e3b9707a3c906ca8a8bef0d53d6d6fd6 +size 14536846 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ecba5dda2f421b67a22f3396ef9b8fb8d601c61f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09f47384bcfe79c513085e33de007601cb27d0d1bff7eb8742a109c8f9f456f +size 19049170 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c20c0734ac1a1de66ec909260860c953624abc5 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d70ce6d21c83f0839cbe8c8a12321f8602cc8681fd3972d234a231bf581cb954 +size 23256968 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..dd76e64b9c4b3cd1e4cd82d9fee6a096dc06ca20 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_DOC_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de91020544c3021eddd30301cae0af654daeffc5abda9edac356971edf11f03b +size 13882310 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26d8f30f7dcef1b63355f361e7667ec9d5d683f9 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38ce0f75f47ef8aa27f8edaa264100400bf94f1f5a75759a8d7082e4da54f63b +size 2874769 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..12a1c1145f9a0e467a26e79de419252f0aaee1ce --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bec7ed5ffa0d24cb56c1b41b601fffc023a7c5c2edcf6c4bf4a36a898ff64e90 +size 5087289 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c38dae963f6546ef9556972a5530d379d9809bb9 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41cfa691528386f8f3af55372f89da5137f7a4c2ac85aea93d4801915cdc6d83 +size 14750888 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..33190de418fca7cb860c397623d1d27e63a2f035 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:681b07437aeac789fe67da23ddd4ed0b5d283c9744e8dea725a07f81a50c3424 +size 19321584 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9d798ce53378e752d295c8e398b771aa6d4e76d8 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6fa1c0786934bb14de816636a8c2b503a78c9dfe1fd2cbb7e43461c5f7c7f34 +size 23510132 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9629e950f70ba532485118b88336067714549a0b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc3213fef2ee4bec328c32e873c8343788ed836b158c2abc1080daf9a1de4e00 +size 14018684 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cdea34c8039dcdc6095c577ceb4cda27e3437943 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21ec3268b912a0d778a3c6171ffe965fcefa59ee442919b7e31947fe1de30ed1 +size 3639222 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..047e44140ac670d6938aeaefa963f425a9e81e48 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86ffb9d1084026a0b3d82612182e575a3a615648e401e2244a16d640f38638d2 +size 2606929 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3e6fbf9594c26987df51950e6704e45ff0c080e8 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575f1b56695ebb2ccd03e4924fe015d8141729ea3072d06f1ebffb346baeef2a +size 6567830 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8f78733e3e3472bbecad4732d92328b384500738 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2984a3d9ab7c3b740195d80c5473122518f5369904480500c0d521e662d871e1 +size 7914254 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3e82b07a96067d20bc358d4c843d2d2cc8475dc --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84e97c44d354574d22b48248cc9b32970a5dec5e49d2e6b0e6e7abf9d80ae1a6 +size 9301240 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b048378014e594761359b4a87b9b134ba82c1684 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_Correct-the-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:676e6344f66c1cc8d9e136c96eabfa8830989bee637747e477756d336f208c52 +size 5331840 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..e99f0a94cf7194d9fa130192d77f9b9053d21cce --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba4d749cb6f1fd33a212e0d31b7e18f3e7ac6e44d37d636e641cf4f852372521 +size 2115935 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f940dbeb98be19c25761bc35c7a71808ad8f76d1 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26067438e4935dd2353297894d75055dde41ccddc1e4561d6ceb43c169d4a4d7 +size 2900909 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7f1aaffb6dcc5014fb4afcf2dbe7d457e24ab576 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fff06a709de64bd138cbd7223af35488117918c296506378760cb4660e130e8 +size 3683911 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ec5d4b952b441c397805bbc4ebee93f5a3c8cef --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93a6c44b88b5586d4341a305bfdd59553c030c01bb362932776441f05fd6876d +size 4458372 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..44be11a9e10769159329c4cd5ee5ac994063c800 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a2d42015f9fa5a3084682173c3cc37fff2ec12156c65349e7ab244d7eac457a +size 10509032 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4587f7144be7502728d3144270603873bbc6d2bb --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_choose-the-most-appropriate-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e02c7ac9cf59b3622b4b3ba32012a5feafa666673a36a888b13814ebd802f2e4 +size 6040677 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b5d22e64d0d50d1d1ff850ea0f364252744dae6 --- /dev/null +++ 
b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:932a513ac471214b1701f6cc15d8d37c66797f6324ec703ca672f65ba416237a +size 2991451 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e645bc6d73516e6d595f14af99c2681f6f56357 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:737554674e7ef263f6a99baa890a467c7ea49c7f785e93e7709681d2bd55538a +size 2421541 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cc73a6c294b391d8c3f8c6a11b02095a1584e200 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3164d44c6e71dba2eb7bd0d6a38f20da7d0c2d15e81fb654998a368c6d8c144c +size 5786160 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7166a0d81761b19faf56059571c1725a1977a3d3 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626c8b0d8512ecc73a596f3d1c22401f57626cf296f0d93cc7d79e6c74918187 +size 4271348 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e97b79b6cb2cd0dd250b4f005e3b487378e5e093 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57581954e75d8d620447199178fd2c7d512a0e69dc97f2d02f8d3ac6b67420f9 +size 4812554 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..72d43ce6d187b32628c0502980a1c9e5c6e852f4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_no-prompt-needed_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67d84e0655ff944ea375596650538f1f9b8d3ff8b6a60f3724e280c52509fc64 +size 2696224 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c0c60c006c2b78f485f2d0b8c6368d86599617cb --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56bfb7603817d325b47e18ddc0684b3891e319f8b4557ac464e0a059b371fb2c +size 1864129 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_1.jsonl new file 
mode 100644 index 0000000000000000000000000000000000000000..4bbd6b5503c822db30c54194f926d7c000addbaf --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3c332054a93a1fd931b7c1de4cc2abae71dbd58eb26a1079ec09987bafab1ad +size 2557203 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b4d69f4725d9521629dc461f6a2d4d2e0b469de --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24faf9813a9a5ff50cdec52d2ec750e2ce7a993422dbba0fb83c30446c550b32 +size 3248305 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..59ac3b33b2a84d13ec092ae3e083f983fae33b8b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7187b101a7fe52f7c08be57ea9be7382660f33d0bae4990661d2084cb1f53cd8 +size 3930866 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e855df58283910efca47e01aa68554306df526bd --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46d98e284afab9372d12dd934930d1bea0d813f2e0132053049d836efef49292 +size 4635110 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5edc1a074099de1d64a2f0670619b01765e8aec9 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_pick_correct_choice_index_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0288aaf2fe70978e4cc14f8c36a293415bdb1f9cc23d9b694359ec548098d413 +size 5329371 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a216d2d69a78417d28b4860ab8fb6db4339da78 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53b238b001f3d8f340510ac54684260fda763de1545d6f498db0db7bfe689c7f +size 2257332 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2c8447f2e1280e0fdea18397a4ee53fd2a69d213 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6e91481a8722e69744e50b5b56ba917a0c962e916276463a54ab68ed98476bf7 +size 3018133 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a4579e08528feb19a9bea96b0b40ad6621e0e39c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ab68c39cb38bc61f3f3104fc03c41c645b3e39aad195f8d0f7f8c9bcd0f4bd9 +size 3774084 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..665ba2a75e13da892849665c1c6390c8118c9bb9 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc482aac2ee46cfe10837e2e88ea12c1f0f6efc0574e34bfe124fe2fcdc12f44 +size 4519012 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b553119d4803a74ae3a5ae11bd63be3b960f8680 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd8dd83c64fbe7c46b063ff869aac9a6e26ff3f12ddd316ba98a8bae35c4e97f +size 10587806 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fcdff7e75ab3a2bfa7b02df34e4cc6103a93f627 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_piqa_what_is_the_correct_ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7b437be452c3ec4c674ec23ce9f63bfa089d9cb0e95eb411ee8733a05ef9d1b +size 6055216 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d73a4bdfb48834a859dd63383a534084dcb82bdd --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:030e74b43fa1d9b29377afef1f926488deb9cadb9ae6ba2dcd021354c0d537b7 +size 640020 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..295e7aebbf95288d4df4fe1a6a0ddc293652e24c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a9dffeaf807c5ff4c77619e76ae0654e36df265fca471fac4f0241c16323f57 +size 755025 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl 
b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20a5eca3380aac0cce2fc384f37a1963720cdd9b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf7aac6d440fcb9e8a89f251a0f79494809fa4fd114441347bf69a14bf774f1b +size 871256 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6470e3d965651c88749ef5a14af5aa6f9eef2a99 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66f44ca688cac7eb49bff60456851a0dd067435449f44257b46399063512e409 +size 985682 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6c987d4dc70d3d14e3255773e28a125e328f0623 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97e5e19ca7cfa7dae41f7481bca217bde2d82f1bf2de82d1fa7d7934e76b5cec +size 1098362 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aebcbfd27d9e66edf8607105db321c38f1702682 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f9b4df63bb9e11cee470799ab6189467c7e705a6fac675097c01a808b7b89b4 +size 1213559 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9b8ab2811a8fe8205073fa81e180c99e65a05a1d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22b1d8651baa54266e08fbcb53654e668fb84f598b2cce7d58408f6699306fd4 +size 1182471 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f9c07f58f74dc60a39555998cf24d9dd2a64b932 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8956554566f202945bd61ea5e3ec7b1ce493040ac6e1e6ae92269d028746dd9 +size 1779277 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cdd97c130a4b5a27f0f64ace10926d90a33027e4 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d26ffb0dbd785c375b101b2fae26876025969284cda1fde208b81c241db540af +size 2388528 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d10414007597fd515f5f5ec679d9908d9ddbe588 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acf97ba1cd92896b6c9015020bba03c1b3bb1d4eb05b3d9a8735dc2fd04a4566 +size 2973462 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d809f6ce3f1ba9919ccdcb1b92ea036cef7e905a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffba294323ed7185096ebefd5ed3219df7e7073961e6faee4e8ce13198720eb1 +size 7111660 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4cea768f7f6aae716bb674697ea6a454fd05157c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Direct-Question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d54ecd0e0e7872657b7f5949372fcffeb0e49d857284bded7e16de2f7fe90cee +size 4144649 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb5155d476d53039c1983c3220b876509d0cbf6c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cec47894f5e4e9f8eb8f05acef75d74b7be3997fcc2b801ddd85f46b2585329 +size 1328979 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db14dd79b86b12e918df0ff1aa3f781cf0139cf7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c08e44209895af25df445a1b3b3135521eb2c48084ada08cc62fcbeafed653cf +size 1524794 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..83f7f9754f0acf47f86790120c583931ffcd1034 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90a6a2ffff784f8f1b67a4634b8e6c49c583757df329fce84ffcc1e6c8155e36 +size 1720806 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl new file mode 100644 
index 0000000000000000000000000000000000000000..1bdac60cacb5d743b15acff282e89baeb41c791c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:747e90a1d7ce9da849a45d5045607fe9ae32c71f8c693de06a50f4c639c43dbd +size 1915904 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a3a245c2a3efc27eff6f5f68ca1a8453bfbc7a4b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dfc7659f932b65158f43fdea08025885ac2511fb80450fb2e7a7eeb22b571eb +size 2107466 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..34baf7ae0fa657b125d036209e090c73bd34438e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f1baea0cec1fff8e3b99847f9617d4f281f06b9a71fe46619bc43b811b5daab +size 2300983 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45bfbc478e053f1143695cfcd04421a437cc36f5 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2862c8380b875bc330f73da267678794b11a8babd0ceada0cec943842c25d426 +size 1935122 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..407e6f42c8644569a13e63bad51fbee6af9479e2 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:463e054db3659862c9ee0573aa3ce30e1978e52adc34fe2a6b92c255bee098b1 +size 2636511 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ecf82b743ab885a4265c92cc710bcb34b333499 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:469a1f6c2c701bf5e23829ed851e43a035d85956d55741a262aa756538bb8159 +size 6699220 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d95217e5ff00fbdcdba631a091a1fdca9f927fd1 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_3.jsonl @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d8e9f85950411cbd817d1dcb09e26812500da2e4ade5496835d8275b0d5101c +size 8078456 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7f4917c1d4ea6382e24736c5f8dddfad5865581 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57f2213603c430d6b35c2b62e05f8e70840557f257a37cfddabb3511c5e9ceab +size 9448880 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b3e530931988061f4fc2d96a70019a3e966604e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice-Question-First_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e2669c8e3abf67e7abb1b9f1162b4b940f8711000e140a8649e5e3aca3e3751 +size 5415683 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9f60b8523b63b6de5d613eb1dcbb34a29ea98627 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a07deb9b062a5322d618f4bec1e7f75e294db383b8c88c53e5823daf178dc76a +size 1870116 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b02fe934f0ae07802ca67a80244aa343d0adf271 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47b1d6abccba3f959b7b512a8a5a9bb0093973e65b3c8de4ab780f13626836ad +size 2545564 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d03e1b5afc01535d86b74b00dbe6d43d292d8165 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a1e9228bb693944b8783d55bce647a9b0a77f01139b46b62ed8e297c2fd2e8a +size 6465156 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6cea017b594c18426b58d3734bd6977365cc56dc --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e42437f1a22a0548af1b9e17e54790502e2f31577794a575ea38cd3daf025fa +size 7792298 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..378fbdff5b91961f1ac6f99076cd2407d17f0704 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5329493b046c8b6bfcdeab170834554b86effa90677c303a54725352681435a2 +size 9110796 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a2ad6d48555af2bc945017a1c422a1034fae46c3 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_sciq_Multiple-Choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7d3dc3b4e843e4d6d28bdb8fc0bd4f9055eb728b35c8af64a9bc54a161eb6f7 +size 5220694 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7afb4037a90addfce7d98e05b297d8748bf5b6c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bec9e3c39fa7e6c4d84ff506fe4bab10385681e0b3984f64f4545b4d69e3884 +size 2213396 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8243d5e843f51cb7a200df62bc68cb87e402c955 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ca40fbd68a18071132cb417becb4d87f8019233ccd4e84f727c0fdd76c4e299 +size 2974318 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..41c4ad6ca669aafbf9743295fe54c432d271f793 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98d12531cc5c40be538cc919ab32baeefc786ce9dcdbf8493ab66a81c53580eb +size 3726659 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3ad4c8e2fa48d5fae97e08f02d1ec6a1a8e81e73 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bc6795612c5146b879df004bb8f1891ed8cbbe33f813e0132db66872e1e8030 +size 4478123 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2bba0c0790b7b2a158d04856688811528b592a78 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:b4e947c9c0c5f6ea790c8790744aa77e4512e2d399652d635694237a7a34c405 +size 5230096 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cfd2feceec1cd6b65372201d5a77de47f95c92a7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Answer-Given-options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af9df5f9d996c04392281e7a200c62156b2307a57e96a1d107509cd189f40b28 +size 11960968 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cfc15b19724da8b5a7f40776aa0eb3d2de8c7de7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3523a4f73d54dc154f27a522df710649b64a52149363654e1b5e69dfa697f4c0 +size 2361385 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5e4ae41add81ed25fd2cfbfd00ccc18c349328f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f42a432669fceeef3a6a77c929aa6284ae9ac5c8dee42809a698423f184b1def +size 3193630 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ffb2b263dec5d997b2d882b1a8dab58360f6a2ca --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a3e826e05fead9eaef88ba3118b7a2d0e79bbd14eaf380c9c428d3c221087e +size 12049896 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..858c50a1c9f9820dcca1a7c1ee0dc387bc5afb0c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1ba1561d7f5d91dd1dae227036174dedb4c29bdf58659436ff667d2ac9bd967 +size 4839321 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1201f2319f9ea1f0975ff875c01bab82604986cc --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70a9a9b25b8501259e50b12785c3d2e0dc819ba3f122b609a428e01766be9e87 +size 5662151 diff --git 
a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7cde9b69a77092a8c2e5214f5471c6d53c7e83a5 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1792a59fa0d142bdcb59f017d226645f923f08fbffd3089e5f4cd0d5bd453b8 +size 6484028 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aaf4cc4da11bb9fd68e6740264d481c4ad43a5ea --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34f2d5829dad6e83abb27a576c2fc55fab498a255dfde0e2946c6751dde67593 +size 1878973 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e2fc052d343b020ccdd040526834e1ef7ddcdc0b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e4acd9383672f9ae6484b575894d4df956f643a9241d2c1ece0be535e62186c +size 2434983 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..558a5129082f0baa3697bb23227d4445d785d5a0 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:012985f539cba57a7de7a00080c3d03e8a70a3df1233994c480d12a07a5da111 +size 2985450 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b132efbc868c62ee4bb6a5f6c1cf8f76e9b67a4f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0869caf11118a3e7266944c19afc8da3f83ed65f9e8f74b7b8e00d1cfb38c55 +size 3533968 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a58049ee2d3b12c2a170b7f901867dbfa2ee046 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd4f6e80de864b893c274c4d380fbc43d2fbc77e5c8e2acb97a7da18f220cdd +size 4082301 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..a904aeea7618548da102562768d6b47fad5f0bc3 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Generate-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8665e580d3e3af0664273508847821eedf401070095f012d2a8cef80deb3a973 +size 4629419 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..046f81b0e6ddd985378ffdf75d90a09a822c4032 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32249dab65447e380f7b6b90872b58e58b3e007c4ecb9e690d3da376957e955b +size 2370764 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..08853cb31bdc0c5247970dfcf1d217b31940e73f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5faf1e011cc7ea6d650d767b7895b34db744878719d52d925f498d81bbebb575 +size 3210222 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..733c67a22a647c559ddb51efa6fdc6c2c23a0193 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b28ceb6fb6c75e433148b9785460468761bce0ac20d84940c1cbbeeadd6e5424 +size 4040925 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5dec2d3b2fb62aba38b33ceec79323bc2e0fd887 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80549a7de9835cc51f9be1e3a3e736c242398143d7ec5c631e3c92c2a42ccb0a +size 4871152 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7c647f0cbe1fc06ddd7a1d994e22a1f9173bdfdf --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b71cc2228a7094056641747c3588a0c13fad6427b930a1ac42b3742796e653b +size 5701707 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05cf1e400e7f19039456ddf80ca46ce58ffa88d0 --- /dev/null +++ 
b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac1d2bc9c81d36b6a9678a658e6e7bb0b0d033b8e93c246a25169a1d20c7a2a4 +size 13061420 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..83efaa2746bbeae04237446435251dffdffd9f97 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab7e0b41246202855b1531914b8e3cce2ee9faad67133c5a756bb46c95a14a4c +size 2344652 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..668212b412cb20633b32f8aa5038516fd4a754b6 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffd6519ff27eaeba12d13aaf958a9fd91b8a0b932b2f82196d145feb0cebf0ac +size 3158349 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..717a42d8f47fca5ad170174e8bd5437cd055e4c8 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05723f6c60dc1f9f34472a42872c6023e88513b290b54704a99be4b9a7f1bb9b +size 3962608 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71b69dc388996c44156a3dde521980d29c2b6feb --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:770e1c6482403463c2e333aeae0fb5edfd28e4ca47d456f26a33233e6cb043be +size 4766436 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..01fa148c99278b825038991b25b14f2718724890 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:979da613c6a06a9320c11a9077dee819a30609648e83bee1f58bcd42ef43148d +size 5570653 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d9e496c37963812e35fd07d5f551bf310ac09c3 --- 
/dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:937206225f91ecad615771ea4cef680e39d729a540c30f48e1b70c9e6d87cc2d +size 12747258 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5eee96cc90196b3140396101797edb07a2855183 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eba308fd765f64597d4baad2f64bd20e4d7444bb7b1d77d42b150b9faa4af206 +size 250483 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d23ad9d7915aa8df2b9695158e41f23f2c0a2796 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d47305d8d34f97540d316ff9d421708fc1797553fca63e7efbb0248fc7841971 +size 351266 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe6036731a7cbe83ccb27595eced7d204b7c0413 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3050fff23ee91dca6d238ab61c573003c8fc3120eb2daad179a95b0f35b93425 +size 449581 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a6a815df9fc0a0591ba44b79a87b45896965ccc --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b31a2ca2d40f42c2f2c398800f1e32c7a32b64b05a73256349568e847632a23a +size 552948 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cd6c43084adee96c9e44b7d21879a84adf5f42d5 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aebae8a1f4aea3d271122fa0091486c0e731d25eb1afd08182e341e6e2914358 +size 651568 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..46be1ce038e085831e6e4b9519310395255dcf79 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7890bc7676c1027c8b6f0751b9e672d4b3c438321f3a52891cdd73e2e3d0bd19 +size 747471 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_0.jsonl 
b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0b35698c2ae27b6b681684894c0a79f0ec712239 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c315b3da69efbf3c69fcdd75754a6aa9ac4ac556417e3ea0f141cc2b57b5caa +size 293152 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77c0f8d480bf954fb578d1043459d40a43a0c9a9 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0a06f820278d561016b192bc8075a3b7314a10668b22209a99910f30591f346 +size 415394 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..01cb15471c8681a2b7669af1dd80312d07360bcc --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c17e982d9a26e10b20a008fae6b8db2e74efe3c10142263abe386fab7dedb0a1 +size 535335 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..625ef43d0a0e984c69adab6c789da9bff6af582c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dffc82605b7fbf21df76a5008dac6b176e76f9eefd6ce05d3db72dc81f3e91c +size 660328 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed39f0457e9d873424ee3999b1961f4132fb60c3 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b17a108a8f239139d1f19f22258ebd3cf71e5f26dc654d823b45b01e9936176 +size 780564 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5022d00d1ba2d9729651b98f00a64978b1cbd27c --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39e975fdc3afa3f2bc2a10e5e450e048f7d2edf906d844b5ba4053c7f3b1fdec +size 898085 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b17abcd0d08773b8cc4eda76ec0afeafcd93a94f --- /dev/null +++ 
b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77d65abe7665ae544d0b4537cfae18aa0d2d955fbd9f3d78026e102b053dc243 +size 258505 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d93f306d7f158c48cc7c23bc42da896e5fb484ed --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4ac01ec9c4e2ba835795032285930957a0218a09c6ecab9a3e5105f6f6eb01a +size 363042 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..58c6acea8ff183344fbbd10ecb7e3f16c29a0cfb --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21bba5b5502e0cd3df82c82b76e0e7d1460017e435c06c558627adefcdda7ca5 +size 465261 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8beaaad4e74b2a520b1175b564cf1db14c541c6b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4306d58f0a0f1221ee5f5ad531b44997e0d24b918885bba0b4a797f1a3511510 +size 572538 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61d6d10a3836c462b2361cdf24ff5c23a3b19273 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7830b5f640ba7535b67fc2929834463410e57e72c4ba6a64fac8315f800ef0fa +size 675042 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45b177f44b610ffd68835aaec56e366b12b7372d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_does-it-follow-that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1b18dcaf7efd3b08dbcab0f2ef7cec54c648d6f0575b0205965a5ebe6c3e241 +size 774828 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2705ebbdabbb02e5c78a3d981e4ddf43edc1a25d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a02c56708a2a6058ca2f7d73b8ae070d65f5bdc6defbeb75ed3cd8d2dfa0536 +size 261294 diff --git 
a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2689d3318836b5dcbbc30cb37e82f9d5586fba7b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61227a1214183ae12588455ab074a34a19e1f8ad7f561d8cd1915462b3ca4d5d +size 367750 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..297e4695d70e2ad45f8a7caf49aeab7463158d5f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:950a34c355c884934d298f3e84aa614007662f3fa541aaed56452b11d11fc25f +size 471899 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..223cb493d0527920e6474574cbfc67e58888c274 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c3fbcdef56ce501726f2e1e0dad1f07756856591bf24b815508c5b39e57ed88 +size 581099 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..918163246d86b9935a2f7f456369437bcf26d9de --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e27d0f2d1db7e962f6ffe1748fffaaadb6665d7aa508eb45575548b841cb6b1 +size 685540 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ed4be2c654646a2ed8d2b3374bef52097e59174 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_guaranteed-true_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:969389a220e4d6195e003886ed634f8b661edffa4b797b0016540fcaa4976687 +size 787263 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a137156f78459bbf756b2e17406aa18a015c959 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5767b8d896adbaffbcd61bb1c7b37cf1560da6158a946ccc1602aec988f7e16 +size 262402 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..351a660da50ad873c9b3e0569cc06ad15494b66f --- /dev/null +++ 
b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d98661700638905f465e2a68a139d716fec832e8af793801dc2503f2643128f +size 369689 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1f9caa668bf85f6f329df9bbd6368a3e6c174ece --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b45af7dacdf374a4658d50ec351b23d9aaf4a58087b2988eee7ca46444bb9182 +size 474662 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..11e5cafa150930eb731d6700df3538714c8d5100 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c26eaaa71d46cc1eaf2da9b01058b957e624fb79f526a14a1b01753bdf0d6f7a +size 584696 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ccb2ff52222248c7a0abf03a8dfae223fd9b726 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d85d0daeded240ca94fc358f847048d8ad39a58482b3eaac5ed271b438545a92 +size 689963 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26880fae6c3a1bd60c1ab55fa12bf43e945845ae --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_superglue_rte_should-assume_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d716ecf3dbef285a6e7416e51c4852c2bce73d5be85de7375f94a73930501259 +size 792519 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3902f807956e3efde6984f82b2e443886295a89d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30928ce488bdbfa6d4193830369bd16f16f40a0f4e6b9c7fbf13804069f0c3ea +size 1039114 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f96f626c5dcbfe50c43d15647a717eca9196392f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e396a209fec285fec6a1b10e9578a8a230a5a8b0b7f7651d33d8d92bb98f33e9 +size 1300132 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_2.jsonl new 
file mode 100644 index 0000000000000000000000000000000000000000..6ddb985a6cf623156da9d9bd1da770dc74d09385 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab97d854bed93b55fb194c4662a0938c42fa6b42f6c31f64e406ca376d34a27e +size 1561232 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d518c6783777b108af565b52e5528a03df6717ec --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6b143f8464824fd250d20f7f7b8ad1cb84d8c546b2c75e5dabff3acacc1f19f +size 1822701 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..261659439f040595fd1065425f3605424296cbe7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fab29d3c18917651d6bd50f5d8932d1646bc3f6e4af6878f4575bf2500024703 +size 2082990 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2991a2945354257985955a75483d3365d0bbb32f --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_Replace_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eeb683966b9c5860328a9291bfe10431c219ee11dfb447ade9ab86ed4aff6a4 +size 2343673 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e245f293005aad140bb18e66c2ed064098d75a7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b38a93b006d8627048357bf8f028bcccae590a0c17ec1e69108b542f48880ed6 +size 948111 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe99c2496bb5cc36ee54c8ea02be7896a4ca6b29 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4b33d2bc310cf1dba7c80970eb6cd3ffccc53f3804559fc397da8cb96c7db44 +size 1180553 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4f0fb92b054742f07a072c2e0e01cad84b4ab2df --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfbc163526e84fac2290a8a1cd6cd82ee286c5383c8fddadbf6aa35e4b07fbbe +size 1413244 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_3.jsonl 
b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..609180c95787ed331e5cbf8a4b871834053e585e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ea563a45fe2063227f4b4af62ad666d37bdea3e80d6a10da244aafb86246e3e +size 1646164 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eebb3463a832c6764dde54d64d9d90693bb9df9a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a6a0458d7f398117658801235cacc8da0443d253170a94c14f414fdf1fdfbf1 +size 1878055 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..882fdc2700c69fd20d8738119d04787ef99e6105 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_True-or-False_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6a9d534331c211f86d57e82ff51025aeda34da7dcc854739bc51fbd9306717d +size 2110209 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..556264cfc52df946c437b50e4da3a85428af1261 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cba06c807522e6e4e41e30d08ddf34144e1a7f80ea030e12092eb13017666b20 +size 1010176 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3cef49d189a219ace4dcad2b4dbba529a1b9c312 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a524b0901325ff1f13b99f809e1d922282fda8e44d2b01efa48915746244c1e6 +size 1243134 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..623b7adcb08a4a551a8f1e9b0b2da635308e383b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4162a1770f8f435b3376f66a9b23bab8bc75a901a82f8f34e66e01357969c274 +size 1476438 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2bee980f0d91abc64183bec6e9a664d02659ec40 --- /dev/null +++ 
b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a403e015e368be99d3c5fbff0e09e5001c33506079cd4f3b8ba467a2415109dd +size 1709962 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5be171be5f4267ac8a1487273739c90b63a49f5a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168969ce76eb89e03d3597e09583ac3d322da0d0cf68631992524e7f0ff76dab +size 1942405 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b5fdcc3f5308bc2d26784e9c76e5c462ecfe95a7 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_does-underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27f09f0588cb4d1a0db72b0623e2540a1ca011c5b32da8d8bcd6c3a24636d59c +size 2175227 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf8a487172b708bbc7145f2ac6282ee6aec1b6d9 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8afad79d69d569067060aa3420b32eafcb6d07d380e25099b69b7349abade65a +size 969511 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eaf2b5e4f21bbd67579459066a9acf2939c1a39b --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fdd876effe0412359ea17d482f6ffcbf2b1a8a5aee0770c566011d831383790 +size 1205133 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1987ef5a741c9f03f1270d5988eeece1eaa01f47 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c1a2f444f9c275247a4a57938d79bac28d191b4a06ae88218f83b30299d54f9 +size 1440966 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..82cdf9f80ce70eaf9027d3a290f39da21bccd578 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecb1b9954513651715bcb739ef18bf358d6d5079c72ba485d1f9751fa4d8dd6a +size 1677037 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_4.jsonl 
b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3acc61c43c91527b78e88ea443af55e351ff2af0 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cabd25e88def735c7dedf595556c434b624f56a388c42f23f4853f779f0f8896 +size 1912014 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cf46452528e9f57c729d8fda6e1fd56b5e80b95a --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_stand-for_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac1c37e5109a29566f78241dda9d21097b8ba0004173a3061a1484ee330c7670 +size 2147374 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_0.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..013bcd3c07f5cdb361ead40acd2bedeaed580a64 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d68cabb95da3d67d36e7b13bc06038d6326d07c5a65017fff257842b5292da75 +size 1016445 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_1.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b225d70896927f37c4de35e9eaa1f90694266f1 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94986143063e4f139837a8799a8260470c6390ad365510791445d4964d208b37 +size 1257133 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_2.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..136b2a68dade8f3ef72e5cb8f13b3264bee885c5 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d90ac05dfa16a7edea75c487f54f023e1f234b724d9fce59c748d87b647dd7 +size 1497966 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_3.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..66157cd14648610ae3ecce2df4a0ee9f3a9ba86e --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a9d00a7aebc023794385d1420bb3c3990c3a8e9d64aa5915b05ac702d8ecd79 +size 1739098 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_4.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2769938839d520a0ddb3dae47be0c28df8f935e3 --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:0b78871d3ee293fe269283d1856db4c783a244b5b61e5154825f9881434bc8df +size 1979144 diff --git a/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_5.jsonl b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..11601ad4b825b35709333f0ea39b479ed45d377d --- /dev/null +++ b/4b284b12boscar/eval/examples.4b284b12boscar_winogrande_underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41e97285f76a3a684bd96e4cad3469818076e7694717a2c13a60a5e1aa6d476f +size 2219587 diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..30db2b885b9fea4b2463bcf9d5af106abf38e186 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4358876645741479, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04475556276641748 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08601617848273047, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003027222218230607 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.29226306498620885, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005344776046578116 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11084365567127677, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022963454664604076 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.04041394987853437, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002132583963194437 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1415225530229299, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0034803378779995004 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05146498360167007, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014277295107541841 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.08239970522779808, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0029176133842350504 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.2827601971319833, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00519215142624447 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10611300272902285, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002132214334314073 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 
0.0815748628057772, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002939098844173709 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2756279858381666, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0049864009961195395 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10445493190549919, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00215267434014802 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..39edc3b68e98f6343344ae5b386f38fa112b2318 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6458190048668558, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03274318464461181 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.13985160542322855, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004374312136397895 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.32007070778063884, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004962532451445636 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.16170102806244036, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0036142070180597507 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.0694179465016437, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002921286815952664 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15883761787677303, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003480119679954502 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.07885240324677927, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0023523131620574156 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.12591553956133228, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003854734977633317 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 
0.30061625652088586, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004630766957074663 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.14742576479874345, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0031238502419319854 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.1283694931084644, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003947705918972332 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3021994113995889, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004647780404212457 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1497000837635611, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0032201737172114866 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6fb387adfaaca5f41991d53830504e2e77f574c3 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7970016196921748, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04581096081746233 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.18059337363344904, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005339355314623877 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3595991748021704, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004980869877485108 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.19703997895188197, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0042195368281687335 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.09688101950971822, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0036313101633365223 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.18755499121707667, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00376908816872013 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 
0.10271084479951895, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0029196673345892295 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.15982668252407492, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004603772659928646 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3351391580099154, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004585332134390738 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1772530360597836, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0035959119026034024 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.16383301190827276, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004742860098171715 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.33868792774850437, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004634428210263519 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.18084941775566396, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037099919587234576 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cf2c4a02f9976b7237b29aa4e5d6cf3aed5bb0a2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.9098079216712224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.049900879662925314 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.19212508744736018, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005617174437077427 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3703600547581712, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004938819064876214 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.20522389412159425, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0043615630325957435 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 
0.10484577205978293, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003926715523602227 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.19383710059664028, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003742953011901514 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.10810375284373198, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003075123686799679 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.17083343128882605, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004911086258050898 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.34476103034505623, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004533133149997157 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.18478247984025434, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003743746721954891 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.17532235438945584, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005084911003429309 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3483193020595072, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045873162051077825 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.18830268691133512, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003857827801954891 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0d72ba65a841017c0cb6ef9ffe930a56da82d231 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 1.052600188336286, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06665675598684426 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.20108020030540724, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005644545699403245 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 
0.3818834779332785, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004965307876320074 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.21317352472996992, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004401402126159575 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.11098039808305214, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003927637491564618 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.204923577311545, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003948475544315969 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.1144786865204847, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003126094675218737 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.1772302061036221, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0048597986989461165 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.35524087803093607, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004631433081108199 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.19087293738824496, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037489593597534975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.183821519223083, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005085692199041701 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.36063124635272004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004673788271599251 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.19652423173229852, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003927577783905062 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..742414f4c299e217caf6898d4c7f281221473e62 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 
1.0960931080747824, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06344656593457922 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.21325470731107665, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005803174587983953 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3950162351216541, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004977496708782323 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.22442306072164447, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004490024810310396 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.11715823165872309, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004000375023616823 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.2112966644324947, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003874637116812206 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.11914106590715341, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003087212545803013 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.1866801247482292, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004986038924327078 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.36529275376302506, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0045911093135183 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.19921393354497344, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037728930475513494 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.1931918251374917, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00519492169870331 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3699764207220656, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004618932928096925 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.20449901465956502, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003922872033805495 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5392282a62de65cbb2974e8eac0fca38d5eac12a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.020068624571258766, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0003544600157797195 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.1322006158687265, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0015422644620452844 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.03365291879999083, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0005366360315631209 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 7.220799408035192e-05, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 2.9844951292528377e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.000540983067735179, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0002518276515393111 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.0001247128207234875, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 5.298844359373387e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.020051794824344497, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00035194324590660044 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.13215658314776402, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0015394013545400771 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.03362856973591861, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005332360539269732 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.012103709829564288, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00020447718700316613 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.08586525160141571, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001063601998485439 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.020445712613489434, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00031075004163469113 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 0.0038097370395060908, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0002219610347061402 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6128fa44b9132d6a363d5dc7db09e32b8e0f05f5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.43626603059505226, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006451581496667008 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.35883293430646573, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005004495762257855 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.3481327312179078, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00455441730109508 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.21137487603719227, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005114961059001828 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.16806612977368068, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004004860983552607 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.1630534157576409, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003746557325381462 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.3652237379799452, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005732161684830544 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.30192882739130533, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004466009943504772 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.28990188092888874, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003995413413537247 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"explicit-graph-description2", + "rougeLsum_precision": 0.3840495459688683, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005925377180947352 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.31495878380419884, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004547250127908277 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.3040864791542594, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004078079672821507 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 6.989261876343655, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.40263571084172284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..307f8e65f893ec70076322b29c7d0180196b7497 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6362956511318935, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00538379300124836 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.4834946858936047, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004801472074915238 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5078107057324294, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004140666670121251 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.37135935928638336, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005066687538961897 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.27760684248102696, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004280106934969935 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.29067052011025934, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004014125890256947 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + 
"rougeL_precision": 0.5265082399354665, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005036016101378371 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4026738032734296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004454404008264425 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.41972809208999595, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003877940140365374 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5567637819834128, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00512230716108874 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4214614745436478, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00441184308930783 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.44168829809558013, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038261243388568083 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 13.165075565004848, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.27581373067423187 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5ef49bb343ecc3bf932bde20fc3366281578128a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6553503707751134, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005259442835843647 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.4846986680036073, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004743810212237689 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5164843715667228, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004023809678916401 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.3859418988153908, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00508153143197431 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.2822822546562199, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004225169829828983 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.2999645170234003, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004030255955877291 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5440132947723192, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004956312605167472 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.40534188613718314, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004461515220119515 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.42918810840416266, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003874643346859152 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5740345018130067, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005053174990259695 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.42360054475089143, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004428112509567798 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4507171622999017, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003825881679108077 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 13.845066320476562, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.29872110566731597 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..55955f309a86b97a8eba3d1d61d3cf4c2421d445 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6634050196446688, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004886612003234737 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.4946481837392698, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004782084733979873 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5276390672600301, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0038324896467705604 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.39296194860834516, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004819100077639621 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.2919119739384423, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004378654594933089 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.30837818748213996, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003926426979248677 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5480449518158801, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00473481568788781 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4112826943812621, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00457624935912133 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.435888745068994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003799604065357722 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5816122645097909, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004785030630138882 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.43250226005582193, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004539900910445077 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.46048000687942087, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00372310461186504 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 14.04704336798738, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.32871093104279264 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..be034b7b14ece7138468fc03a9c1a0f386e88849 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6713532949429953, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004849071608319205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.4972585619402549, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00487037629660061 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5333758121056072, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0039026319169523597 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.40052984046218604, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004867933712938873 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.2959925459372112, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004393215975299111 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.3145492300965214, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003964256746686906 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5555980201751652, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004696669229961967 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4141273952452268, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004576109419667718 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.44150951115272263, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003805310353862312 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5896648146852733, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004767497844568815 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.43581277222608317, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045433886339008465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.46635341830341503, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003728106132230896 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 14.378790163362357, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.32798586772214977 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..11691933d348e96bd5cc4c448d3f991a8fd4decb --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.33024351617759024, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.051704359395549765 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.009770361346008592, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0008287344074198066 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.04975365662487859, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029466525852222055 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.012752235193236958, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008894854760023045 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.0019298118317708673, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0002966982970755425 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.010649081491194475, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001169743419884876 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.0026950461011573386, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00033028725537539126 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.008462458079394903, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0006841590966581536 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.04579276197685639, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 
0.0027055630960688658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.011182273722871986, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007469860690080579 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.008468919439670847, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0007353173223218566 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.040428841607895255, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024829119630856503 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.0108012920574524, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007827266712588734 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0b7395ddfd9952b0d5735f2e830571885087e145 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 6.477679081695303, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.33565119461770165 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.46830027855874495, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006300101063417185 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4241912009571325, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005104576106588849 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.38775536001537486, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004542171141416536 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.2406644049185146, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004806392395353992 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.21737008513740708, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00410188320499371 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.19596829580492064, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0036425337193625837 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.3847305010892318, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005484468612669837 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.35286837769549867, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004536843256864709 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.3181224584805221, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003913082026303714 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.40901456737899233, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005705074341293744 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.37079747847290656, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004607267630637304 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.3369880890693486, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004020946462748482 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..06b7ec4112ad094c95ae5b16285f86f0061f9f61 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 10.552449097458222, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.5075359348522721 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.5925714099982897, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005886614000972573 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4987525489495299, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0048306591159496546 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"implicit-graph-description", + "rouge1_fmeasure": 0.4908297093515496, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00440292254550668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.33677107097184933, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004976427241032287 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.28054277473630357, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004218111612756725 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.2745170311876481, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003943249150000542 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.4820426153517795, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005228080311572792 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.41071941896392805, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004422749977888421 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.39923737082902666, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003936921688224572 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5150722423509299, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005454009317285427 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4334293723998458, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004448148760691382 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.424618929381837, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004010785107308282 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..14423f06bd6817afc229fa17eab8d6d10ddb78ca --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + 
"bleu": 14.529959728631091, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.6848885034123435 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6291060498809192, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0053931097015041864 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4998057724661308, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004853037618098533 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.5118142183656486, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0041047394326870785 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.3647296831743616, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004961071506430092 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.2891088910523139, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004352334361895284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.29393229614602107, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004028779652804252 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5178169597520474, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004966872786567351 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4154131273981219, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004548123711158703 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.4217841035478285, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003898458206388385 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5510576711663073, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005100868381183197 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4367988103969493, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004517807079232531 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.4462117602532069, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003857887615444402 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f0ba9b8977195d1f20eafbe3ad870269c1467caa --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 14.501938799210457, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.28415569927247564 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6469147946550013, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0050072292924981275 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4985296869712893, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004847713959588524 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.523291959319419, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003969739964628797 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.37974321642248593, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004758307047276704 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.29279785614455683, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004396790314269892 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.3040146901882556, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003948804430975799 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5316402821401439, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004701609433025525 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4143927386535714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0046130550286461736 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.43115407624884194, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038357397668621392 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5694903223235266, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0048218066519585745 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.43811279641548045, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045849735847942824 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.45876891046754215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003791252529951684 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ecc4c055d454970c4ec8ce06a5f767910a270b00 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 14.556796585022564, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.27852899955430843 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6610002782815263, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005017622845157595 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.49345537230504805, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004924666218534147 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.5246830517987607, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0039796631999095165 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.39346059742147355, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004835604517590671 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.29186648429984857, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004369750234644563 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.30795637687201854, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003917110735112965 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5472030882550708, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004704868980457638 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.41209896039479726, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.004617070429531718 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.4348976156448126, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037848622247426116 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.58299611946106, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004785834392496742 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4337099508428588, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045686487281792094 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.4603023464895643, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037008836133842087 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9a9c6ea391554e55c6a879b39f3af8555384e638 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.06706070850923308, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001641705982706511 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.2309061560559356, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00584465805287697 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.07242669297423687, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001681684378008165 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.011010788571996902, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005458956739597873 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.08004790916119307, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003344495650227847 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.018236794772880795, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008615346385667937 + 
}, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.06355399391256518, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015568088892838425 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.21801502942955459, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0054970304208384465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.0672739987848506, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014524145543017358 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.06145516925393397, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015616424129383 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.20027279793496214, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004938295524532196 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.06387955329019673, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014285492353000519 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 0.24984031541663096, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.012924324247896772 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9c003006024f1605b6ce4348f728759ff7f1d35e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.3181030735546715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005373925400256652 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.3251558121181215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00531664302610407 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.2792227778567483, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004263305560309992 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"non-explicit-description", + "rouge2_precision": 0.12418420030878015, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0038826294171554997 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.1312949228236328, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0038357971846482808 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.10849910513809453, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003156980188228955 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.2631322967918368, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004569485790819501 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.2743254974329521, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004722899827718838 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.23040067712932039, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0035540519679111504 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.278813174699537, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0048188144324256485 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.28403936265612606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00469510820335236 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.24297581585449876, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00371995328995244 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 3.705116187171442, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.2792269501223854 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eba64b63391b9219bb174780fb47e767b67c1691 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.600466363774825, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005282943419136165 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.4910404404053262, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0048916435292608714 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.494991635014208, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0040212580043449625 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.33593288364017954, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004653747053149475 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2768267729211498, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004295843993088077 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.27481294702147024, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0037598053969334754 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.49001524253465367, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004759003697565942 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4057892261642288, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004509538826924196 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.4047413734025506, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0036897985733284535 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5207320474983632, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004942597449806194 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.42593556430915974, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00447978031567887 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.42756211299071967, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003668514458622858 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 12.258816180492442, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.33509170789115567 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e81fe0d65b19fd4d251f9590b260870062aa9716 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6396291098157352, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005012372082127943 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.49296712645136953, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0047855206125547054 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.5154529984750105, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003810967011831987 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.36107447936375614, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004688596438977035 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2798463176070886, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00422023661159036 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.2891214170531268, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003789802955066307 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5210394733474353, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00467589207565221 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.40605775552470086, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004478812348360755 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.4211027205368336, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003666346201802303 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5551710306111246, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004806611169149116 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.4276039412948363, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004451737455618187 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.4457300868732614, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_fmeasure_stderr": 0.003613131206631707 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 12.641415106662606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.2559752456319122 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..91395ae64cfd77497b887fbc4925f98453958a94 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6533901529130552, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004750298489587612 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.4978245547862729, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0049638184483227245 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.5254658165967703, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003834943975498437 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.3739710377631859, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004650812086245668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.28782083216563187, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004469320375742115 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.29898924205288324, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003896580955933994 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5341947075791592, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00448509853552663 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.41109794264396293, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004669221276796458 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.4305179535570766, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037087908384292726 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5678536307532968, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004614146022878327 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.4313369622755667, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004611186352566589 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.45413143397226563, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003621738095592773 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 12.51566452371121, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.18662930850276757 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c799bd7181d7c40d2393936a098cb5c8407d7ed0 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6634432758822567, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0048481143494641795 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.49724281083508204, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004869426527869909 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.529417668132056, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003826308214918395 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.3899421272006655, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004781244315624528 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2916271410817384, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004359700641177092 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.30703573949811763, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0038660421241393456 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5444083011792844, 
+ "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004568064871523696 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4128466016133311, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004599967068818717 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.4357909154278315, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037014554665568476 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5776387605813129, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004683314725587799 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.43239954634861594, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004546461380714694 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.45908015170176464, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0036338440583547262 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 13.024109481625127, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.37467296505230513 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fb8d73f363b25c3b82f35b8a6264573279d22662 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.026407959502331903, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00038868194633258883 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.22356670586627483, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002332253284549633 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.04588118282553198, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0006171605320982103 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.0029537913433360027, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rouge2_precision_stderr": 8.371907248046422e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.028781138144721528, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0009026844718533621 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.0052053675573166145, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00014522168539481952 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.026294482230459135, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.000384669915057379 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.22280019098723994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023265096508733036 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.04568781459679956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006108800927922479 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.022403360020811003, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0003171490032197158 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.19384325682164458, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0020160161936303308 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.039001539304585, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0005037468676529433 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 0.03302516501887599, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.006176680874969092 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c7ca210e442ce302e3065d9adc4c111e8bfe695b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.44343574580605305, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 
0.005836165049538884 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.41167731843976096, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005244681170881314 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.3810155399369177, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004510941559773663 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.20955577077441034, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0045867433415744925 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.19673389590965332, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004066132879496505 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.1786263826302864, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003609690020384216 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.3659912580345323, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005168110662696892 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.34168226812319036, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004631056048180813 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.31302032604227675, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003914907165679426 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.3879497324594378, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005327568561873762 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.3585198739927853, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004665725786249284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.3308490189257667, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003986497508848012 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 5.92155994361654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.4368162501010792 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at 
end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cc20610fc2449167e6ce77cc0587b07bde5909a7 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.6148890739123269, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005151170963850425 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.48751527677115314, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004910148217088944 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.50099732123994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004071389975898556 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.3368551697989736, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004690360563606446 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.2679764741145998, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004297248080849364 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.2722552210596285, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003869080854816975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.4990454995661043, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004719356768833076 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.3991993603426751, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004509330239847587 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.40620639130610015, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003732849234734221 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5311216931077634, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004866432973979014 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4194624660588917, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004505522174144945 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.4297560700484828, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037232038393596916 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "very-explicit-description", + "bleu": 11.653311716196534, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.1705735396868982 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..712deb72366d8550fbed973cd28a520ea52bb3b4 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.6521291829493591, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004862083336775399 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.4925886684392769, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0048436222050010115 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.5212796553891711, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0038836053434542343 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.37085082947442344, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0047222546332911465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.28212557152458906, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0043025347256123855 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.2948667808479424, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003866435870905894 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.5348811228515187, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004618628192727038 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4069011016263415, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00451821222866322 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.427692905935477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037140323185331395 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5673764064869086, 
+ "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004712552384750834 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4266064089546778, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004471064283067558 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.45113220970476947, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003652549529693666 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 12.507248519002058, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.33581711795849156 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..70d1210c825e6106981ba69eb59953b62110b2af --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.6640191119912606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0046287852254404665 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.48722206954684877, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004886820654506614 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.5250242061127607, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0038183444409012057 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.3809187248804994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004628751830319166 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.2823123595032247, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0043838158850553486 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.2999240110444844, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0038838867088996087 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.5440068657045747, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": 
null, + "rougeL_precision_stderr": 0.0043952553052520405 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4027090518140514, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004609214493770483 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.430984226701193, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0036987359826037795 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5781874081685048, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00449591006527644 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.422950486144644, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004568519660925329 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.4551521831250583, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003631473140613488 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 12.396312569329053, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.3293469366739629 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..513522f9c4f2b86ed0a323a123cfad769026ccc9 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.6664615175587149, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004810736210585854 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.4888785939143449, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0050250180805546055 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.523785247748256, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00396852764381019 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.38655727666760126, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004811990065987808 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.2857193152364473, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004475062913060129 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.30152821306111477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003959284720918688 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.548696399374843, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004627017088614486 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4056777694315134, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004679087627002612 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.43158846013751617, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003813577934303311 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5792867412593304, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004732363307721942 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.42303066939122486, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004651572108090147 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.4523923440770611, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037402755921038884 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 12.891215677061275, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.33496311481577984 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..113002d4dd94b85ede7ab2d9cd4e7700487b69da --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.16545410465514598, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022997761900446748 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + 
"rouge1_recall": 0.28561683985985054, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033805688000285673 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.1943147054100485, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002290302577130692 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03891245662630143, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008607597452070799 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.07031350658097817, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016328568998679233 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.046143499270621315, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009592242142563405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.1105354395920507, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015190328037893866 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.19933561661994773, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025398698192885018 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.13129771132175647, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015070177742292337 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.15358147119074234, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00214171849980464 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2656395423722148, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003155332061428429 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.18042769676325615, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002125927765273512 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.5181046478114517, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09250562496871702 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1e2d56c17100fd0b6b4e003f2c12d33c93bf65d5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.19118099408569436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022318050478954713 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.30065602094383553, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003003943741161264 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.21191127536224905, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019609532949790213 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.043480246462308615, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010898262094121871 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.07030911937863445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016278311940230515 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.04782532792275138, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009895310959648507 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.132695679844194, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001577353316550692 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.21276743865231712, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002279648108033155 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.1469428089542953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013001589353820034 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.17835239766630012, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002087403245048929 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.28062460803225625, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002812309149448641 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.19748390376103794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001814999070191041 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "article_summary_en", + "bleu": 2.2923870243499453, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04710829598595837 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5679a1d1c929823bed79c2e787dc07b49b8002df --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.22004296063312756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027587193644827523 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.3004661215661355, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029546541029262952 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.22370918766321005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002010231202549798 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.05536118871732041, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014858900012090124 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.07390397982561925, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016332246699294153 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.054327487227033164, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010981648478134627 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.15663926213726143, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0021407976310347816 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.21401909770058278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022276760034737396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.15758363100687361, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014304910029667866 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 
0.20481968410585505, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0026160792844735293 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.27917387789267256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002758702443143936 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.20778987647499506, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018852458382730504 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.711904811057529, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09526160140046504 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..601723a33ed49a299544b8656c093a7ea30a45b4 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.20203951914040996, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0033197172999644433 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.24139296790371112, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003407257826021676 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.18732343321931774, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024323481657191164 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.05217979169112856, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0016808629240166956 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.060144137552866045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015840842099530614 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.04629662621989341, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011281742136129457 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.14790066047047568, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002654555605999541 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.1738019326166572, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025360210562127504 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.1339808748098793, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001742036427781668 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.1879613854814907, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0031385903112030257 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2239521824907146, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0031803147008683547 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.1737001196246117, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00226610556612505 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.873796046005488, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.13673291688563485 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..37870e1486947302b1628f8ec6ba5fe792e9f41b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.0735717278931429, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0028911131224153295 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.07553229981002776, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002757787802272632 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.060947459186220604, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002112636651398563 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.020903657542024313, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge2_precision_stderr": 0.0014483735027008208 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.019261648377261246, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010410235853285292 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.015588317769795648, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007937674501197837 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.05657476965321192, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002371018879045927 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.05566946485627606, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020412553156093018 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.04515352188180819, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015731768240294375 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.06864642701940102, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002737856522478581 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.06985653918253502, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002550011308620696 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.05648029062299134, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019666947501841563 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 0.2200180577672771, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.027482870742044186 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..baeba3c6d846f7e680546c2cca81005f70e3c0b8 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.012343062142861565, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013545796255510047 + }, + 
{ + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.012305611615879857, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0012537625370804336 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.010077141412512394, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009974698986143257 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.0033110648238113935, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005010803065960274 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.003581989670869063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005441626132486554 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.0028348539794116954, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003871006036728002 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.009684293482664644, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011205772153478955 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.009463327618981224, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009853253825312287 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.007688930494688018, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007722547402435018 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.011416410255274385, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012625531153532816 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.011395661121003155, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001165756457145875 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.009289497933603756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009175552178181104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 1.582389521440466e-10, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 3.0336511902248086e-09 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } 
+} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f44874935c5254cc5f2f03d528d1db5deaf193ac --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.049207080285889174, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001218170848562994 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.07381385524331617, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0018227019727620334 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.05451479621157953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0012807036115901312 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.0051941806701366125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00035925475672845153 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.00839259424814296, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005661600346822327 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.005939258116110201, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00038468302190640825 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.04351987471606278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009955603897709461 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.06622021616649243, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0015711536395290884 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.04839456994889072, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010565437353728235 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.0459909349232918, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011148896123208223 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.06933867001801945, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0016979262230734408 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.051006434420687016, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00117710177594407 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.4202315524967184, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07509386796041675 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6d3c71ed25c3fb74f1f728951b5910da60c478fc --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.12085269058219979, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018800465788083838 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.1212329663199883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00187706234222241 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.10539766483016742, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014004693299149832 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.009745493152667832, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007924698057957289 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.010021780512207765, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006924096781473867 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.008130044029674053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005061494758793983 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.09639222192782304, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00149350986619975 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.09656042627392813, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0014661442579061582 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.08335270418305636, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010371033295063308 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.11579018628895178, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001768658177302644 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.11633579095629488, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0017821423582159675 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.1010214292579844, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013158243234685884 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.6101972105932388, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06528883196121242 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..76d19fb5bb0d0d0e71b3efa71b68b67b75d16813 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.22250413866904517, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0038252703699176048 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.19070576171594192, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029483914941301126 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.1683928422031243, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022763664965003553 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.0621189927043659, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0023125503328516586 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.04614677867390645, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014257058600775931 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.040944778973814884, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001133602887491379 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.1765987395394153, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0032959155772800614 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.14723158465597147, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022781392441315216 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.12964889700558796, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017255313237463151 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.20961600752743212, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0036729560713837975 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.1785620404996782, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027700352932856034 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.15762456715786408, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002130633426308049 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.5697392740712903, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09726441182310538 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9b1d5cc0aa6550bfecf0acd9fb5b6edaa47c5366 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.23920789042272972, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004451923794477901 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.16718906497079217, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0030207461399335495 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.15814533627204788, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024790082791499913 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.07309736843132873, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0026804185613156914 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.043829507427460696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013802638969192078 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.042100869824327695, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011891039526604893 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.19334436627613655, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003819886080978273 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.12983200684113314, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002334520345992616 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.12346136179216806, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019112011754057825 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.22602981564601687, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004287782432287635 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.15608600439681367, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002829819880008212 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.14794587354702365, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002325660472000532 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.149834523005355, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07337967845410115 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..247a9850521fed4b8665b80206c2dd9c5fafd041 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.08353070569399211, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0035352228648524034 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.05030529429455797, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002119662209344908 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.04952738568586151, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge1_fmeasure_stderr": 0.0019248164005399725 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.02788475548850126, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001984709851822524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.01357745926029204, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008062711899462007 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.013962256327531126, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008076774569833121 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.06988961888688942, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003098125936762339 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.04011808095891787, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016990965796509772 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.03967195287913463, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015485058524769843 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.07917805251121919, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003396133321736807 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.04716680249732015, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001999098077515287 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.0464257712662957, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018095491536731455 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.034389491270844154, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.00795211648185989 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..191f69d4e6a8b0b40d26bc83dccf361f7ed56e24 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 
0.013359898000632503, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001546173211155727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.00743087996116675, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0008730744758598823 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.007524276436169901, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008101397059229119 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.004979282809533886, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008580275451638574 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.0021429531558444645, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003530144211538606 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.002347560172350861, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003520939095930996 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.011124009635183039, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013504382029806005 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.005991212365068976, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0007027748270460832 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.006087946245577977, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006667486081828124 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.012927390131069527, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001511374194406188 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.007111949315448997, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.000841252461914259 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.0072146284393324175, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007801116029630629 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 8.014966822272051e-19, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.0374532265072001e-16 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1483b5cae2987f2fa04e73906900162eedbb00c2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.10084728088346058, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016704126782767605 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.11626998129546122, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0017095825126744002 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.09861978093907281, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0013312382950616123 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.006952750862086854, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003742497583440925 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.00825312679525918, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005060847331287482 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.006774058456222539, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003619264283480028 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.09346515420059896, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014766983553922265 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.10921956551040214, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0015306854518442438 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.09204815585560853, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011610452838456383 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.08684132289392488, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014723630029859183 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.10119415272052232, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0015325967355127004 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.0850753438963473, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", 
+ "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011505739554051005 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.24069952714187115, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04999681194672116 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cfebe4699248196277a2177b818a59b06ff05b77 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.12570484541604557, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019387862247634239 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.123630356642246, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0018183757555234191 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.10847730157557842, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001363680486425725 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.01104088364395252, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008639559463958727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.009877120720020166, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006063789166641115 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.008501169030695347, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004987236739387054 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.10137716059142088, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016064276788596392 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.09917736982376567, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001424294120823274 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.08658591358168741, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010363082761354987 + }, 
+ { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.12102429972223623, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018621970559901282 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.11902518130446503, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0017340172425578562 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.10436489995945487, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0012929050919925859 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.6572484010354098, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06678803371648423 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aea3ad3c1b2548bfafbde0eae55ac43bf53db90f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.2117483237424628, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003673007092156956 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.1807723247472404, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028119380867935155 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.15978364007037885, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002146039638632564 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.05045804846177444, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002077814678823861 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.038071387292995425, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012888149094590308 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.033815746580063144, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010388210907900453 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"summarize_above_en", + "rougeL_precision": 0.16822046314553674, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00309089860213003 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.1411027078757953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002198021261123935 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.12422850518706006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016333916321201456 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.2001613012527362, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0035246041252274714 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.16951618612440267, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002612196179301279 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.1501340500446388, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020038342663596972 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.1040082038162398, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07459272356146383 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7380900658c5ab61444311cd1910b03815f475f5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.21604655852321505, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004237013318778521 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.15574838747308595, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00290352384129497 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.1454209509941261, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023687960335153093 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.061614289889954106, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0024851490831325055 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.037776751473578556, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013168249511316943 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.03567535556584864, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011201666793845067 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.17476206179888693, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003650963432920612 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.12177092267587064, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022748704863199223 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.11395609046047189, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018362494195700422 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.203925907737883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004055385996449827 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.1463238349053685, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027458312154874562 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.13649183555298453, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022246127803878944 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 1.8684497943291858, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08209544953446724 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..553e46f664773ef6a20878b7daefc7935381b6a8 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.07275705303203882, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rouge1_precision_stderr": 0.0032507746107726535 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.046114009178668744, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0020790831962699103 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.04440729385734818, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018127470127528602 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.022744080016362914, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001828829152352875 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.011929912900514794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0009066681223803078 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.011240567915400753, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007301610677896298 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.06041777404457702, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002845907268084594 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.03649435423952065, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016477572016587662 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.035245517816726114, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014338608453415918 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.06910110864393543, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0031267048764333605 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.043190618850823925, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0019524580787881156 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.04166885497985569, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017010247269226748 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.023154739388285894, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.005577121563550948 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": 
false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9f3c31a8c9561eb560da4637861b1c763aac0928 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.010296457827498611, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012918860319172164 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.006758532981021845, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0008379711362413936 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.006737087899764484, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0007910409443146855 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0036868286820582992, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007834130189216914 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.0016965282718283603, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00031792662046301257 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.0018352478253792217, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003166724369022159 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.008409961437487717, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011155876274861894 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.005373734560444225, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0006740932130105292 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.005318064241692839, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.000633253894423773 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.009930592441167776, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012607226775993969 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.006454884476813964, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0008097834237119673 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.006429830165009096, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": 
"en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007604504484812541 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 1.901663426428568e-18, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 7.736570961567845e-16 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..955f9cac7f794f14438ed6456a1f652e6b957cd9 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.122915905754568, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027293510115185423 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.1767818917839268, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033841479310980638 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.12988512607981967, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024439343174664525 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.026806616635118036, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008667140586963389 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.04199675102883295, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014222033630357748 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03000716115226249, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009354733970545372 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.0938740707460975, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002177005796825213 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1374323483251746, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00266526479478189 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.09879731353636668, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001799103726263371 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.11485464277414363, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002596139008897079 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.16449395724493923, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0031654167638919778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.12094018141973355, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002284523760450248 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.278917009551764, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09140957868815724 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..da20bea59cd064190f0e0d7219bcdd7ed7d7032c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2321033086693269, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003444990523939608 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.21218959051209052, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028780222957239345 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.18879633772816867, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002216973861149929 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.05986007346081548, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0021311948977268975 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.048915032760613236, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014864819960528294 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04395688090556475, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012580561387008646 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.18107984090374726, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0028960619334779066 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"rougeL_recall": 0.162566462675585, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002220609266556534 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14456965085181128, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017115993134306947 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2192803166896013, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0033059632616290542 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.19959677205987747, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002694200019026103 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1776723900847922, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020872816698884362 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.707464673902901, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.1122035248088363 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9eea86531b8eec5e9c73392b9dc70a46d4a6135e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.3147030364809968, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0037689305738243476 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.25097433153568405, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028925947424101703 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.23558432539564403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021996425715538554 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.09338720402263002, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002368756352904399 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06942686553346078, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016202813479301297 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06557325455887651, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013735424734152642 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.244935161158131, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003198495702253031 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1911576784626261, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002276055119508395 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.17976921379978278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001722115310366516 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2958662208042827, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003600934016250172 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.23572659332760645, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002741901482334439 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.22099937310878076, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002075646266476727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.7904798767943335, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11122294896753164 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5c25c09358f1dad874de09f2ecbb51e222e27380 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2754936136179268, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004157379347035767 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.20528413095627832, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003149288824376793 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.19838663921126778, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 
0.002621759128746396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.08256853694746276, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0024364087557486665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05727334409426975, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015675844642289158 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.055968236855402816, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014034751658766104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.21625894747073696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0034653747339702624 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.15717871923447801, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00246051792291125 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15243563853997083, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0020406692721428466 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2591405389486435, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003962509253114959 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.19231398442318168, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0029662256128315167 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18589041350629654, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024658633555113495 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.8632324313744557, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09299178906804655 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ac76affab2314fe5000a849d555ada9c0bbd6f31 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.09597320792153687, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rouge1_precision_stderr": 0.003523744199921863 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.06743171611131621, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0025672971207144984 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06581475266895565, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002308785748398076 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.03073553158490422, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018205157024659981 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.02032186815564804, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011741990653970874 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.019691124849550927, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001032704031652877 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07767834454496185, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002968257043282438 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.05327838136711029, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020669018184854763 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.05209993435974213, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018508808521006768 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.09035808108893525, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0033673742748827878 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06286918923105832, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002407780221261254 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.06145306117564354, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021697142437582524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.08259030504562871, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0071288759428394894 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2159ef5c5dd9e9714bc78d0a0418ede3ff8e4ba9 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.015768279475199848, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015994817709509021 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.010208407683374372, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0011101868759327263 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.00991335048881209, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009630926105753386 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.005050282757985984, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007965820847066382 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.003233565984990156, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005539676407167323 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0029524028339129186, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00041005985526957695 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.013277476442981736, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013881298634661826 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.008616852733661218, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009635704353861369 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.008248037341762705, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.000807006653030911 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.014923565167243124, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015288899364523146 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.009697966584991889, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001073756967121187 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.009376779842315807, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009193911532830411 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.0184106792465108e-15, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 
1.78802158675685e-13 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b7e67391d6c21791b2d28e16ea53510813a1874e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.09716914266381921, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001588873777767349 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.113047966597192, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0018138880339988268 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.09393150748763507, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0013913781802491765 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.00789406798092684, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000419736727273889 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.010369466926148594, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006220741626663643 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.00810320245121132, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004246779834740568 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.08623270741496217, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013189384242570964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.10103979947244672, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001525640240790577 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.08344187952213442, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001134658015336061 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.09230820827944372, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015118723923496866 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.1069999135512711, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0017087689994245597 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.08890494751613388, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013055354696445939 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.4520613110187872, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04670340488015433 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fd04aa3504b116ee91dc9540935aecac1ef73995 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.1184008680654741, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001564826825421052 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.11934831803746493, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0016574926168922098 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.10497305468531427, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001255055086560988 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0065474359357168685, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00041243581464096575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.007144064143658665, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00047460386541255963 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0060518186828094475, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003723118068721728 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09478143647519513, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001219626359065995 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + 
"rougeL_recall": 0.0956209796427602, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0012884715653876924 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.0835136406112498, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0009308872181256089 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.11382913352428527, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014806869787433784 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.11499167572068492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001581922066092159 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.10097338753997802, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011828254832324744 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.5155792843718754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04989759827988764 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fb5ff4f89a3c9648bb7213ff336fe19e983ce066 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.15273932782795627, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022156125742117706 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.19651328239605736, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002663780636112302 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.1507885545235868, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001834000670129471 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.02517008324499953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009798089194818487 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.03413193930670911, + "dataset_path": "GEM/wiki_lingua", 
+ "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001221169425838899 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.024795023268022354, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007930977642022854 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.11730847696393268, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016795313897096911 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.15294841448894894, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020599858142528278 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.11559317558886968, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013085530731690627 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.14392075150794262, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020782027930187456 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.18487493068333727, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002490594176024061 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.14189539617563585, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001709091043439753 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.5716024718768677, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06669078339037757 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a4dac940b1e3f90d606a0e48893006e00a72e943 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.13391084968723968, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0028933184881919445 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.15711899898252504, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 
0.0029594045076376902 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.12179800960613144, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021714004036956338 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.029181772262838104, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014231487802429015 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.032454024258731846, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012888650580723702 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.02465336911318367, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008917747894159659 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.1045676504106663, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002295339185386772 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.12418186583034126, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002364123273303685 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.09436711261591924, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016117593482645508 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.1256412476000786, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0027136444937049084 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.14768417753168112, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027908694567627588 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.1142421768175613, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00203307593772788 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.6448018903331314, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08986896028234234 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..594c0400545b6f8997c56f82c38b5422c93ad116 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.03781390643712872, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019725007865177713 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.04248222303303341, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0020576616744248126 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.032778186929640955, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015160232311598606 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.008727821675270711, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008860783473493923 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.009476342855751762, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008552628886109314 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.00675165993793729, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005128558386297882 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.030251516700860112, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016420771541506996 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.03390501778846023, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016639430097277806 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.02576037746303003, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011788924925310038 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.03554070882434891, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018653666178719184 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.0395625119959948, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0019080538751288382 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.03062804358044647, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014159235675085848 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.03793835793809636, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.004944367898593229 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2833536bec9aa4ff10f5c6b40fb3390d259c7e67 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.003663087080839873, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0006889075640243746 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.004094057107563647, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0006895204034868401 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.0028680994456484797, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0004506085285145695 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0010185261931717935, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00038254237678243645 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.0009427307765006302, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00024016077148222763 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0006182627044826944, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0001367325028734569 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.0029725997727319074, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0005844640815613739 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.003399810296798685, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0005797265646639166 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.0023407410922934428, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00036664012211638547 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.0035444907573141614, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0006683251187877269 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.0039682862002560155, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0006700453965634391 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.002768076356007906, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00043244630871936497 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 7.352480577371536e-22, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.69354618200267e-19 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..595d36f421ffa8576cb5546b916d97e4636f0136 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014933117490932577 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015039986742055235 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c375e6971e379f5dde95050542d994cbf7378d10 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014888272588203931 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014830507204541028 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..454153333b80f7a0895e70db9541cfbf126c260b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.347, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01506047203170662 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015050266127564436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..706a97e6d974324efb613abf6636423673689fcf --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014955087918653596 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.348, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01507060460376841 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3d61c4656c177adaa1b19ec64b4fd4aa63e5a199 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.344, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 
0.015029633724408948 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.351, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015100563798316405 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..940a6bcb310d049213153f86ca3b02dec915756c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015050266127564433 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.354, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015129868238451772 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..82215a594617f24eee915934f32bec68e6b07d9d --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014944140233795027 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01470919305605714 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..eb303b59e1a6700fcc057b67c78c3046e9ca9d26 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..649c2dcb76308f4021a742a121f25cc67a88dec0 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.362, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015204840912919501 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.358, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015167928865407557 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..446b18d277e7288f6c89a93934d61faf96392716 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.351, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015100563798316405 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.348, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015070604603768408 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bce85824260537f92fa15c0dbab0585b75da9713 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014888272588203933 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014955087918653602 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..74c6b387200d237260f8712c1f9cd781a9b0fdf8 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.348, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01507060460376841 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.347, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01506047203170662 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..06efa3630625f880fa97850207c4dd0232543ffb --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01491084616422987 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015039986742055235 + } + ], + "config": { 
+ "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6452b4ed151d9900c2710e8994538e594bd958b8 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..362d48838c9b9bc821813f84b806197391c39309 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.364, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01522286884052202 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.348, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015070604603768408 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b71551aea22b279db3f972712cbd7ec72cf83f6c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.356, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 1, + "acc_stderr": 0.015149042659306626 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.358, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015167928865407559 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3ce6ce1e17f5a260dada0d15454c01322c42bb25 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.353, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01512017260548369 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.351, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015100563798316405 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..022ed0d155df917b52fda04ed2bae49ed537451c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.349, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015080663991563098 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014965960710224473 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_0.json new file mode 
100644 index 0000000000000000000000000000000000000000..793f3d2cbf201c836479af6ab1f9c8cb99404be8 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229857 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01487687202745673 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0474c6217d118363c3554c8552af508fd0cd0340 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014818724459095526 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014770821817934647 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9a631002b7b83101e9f311e900094df7f2e4bb5e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014794927843348637 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014876872027456732 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2cb7565eb01b75012b16539a58e2306b94df826f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014818724459095524 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014853842487270333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..888cd91a99208fc99a46f7cabf8ad904e81b8d60 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014865395385928369 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811492 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d783deb48101c3c93dc243fe842f76f9a39e6492 --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014944140233795023 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015019206922356953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0f720c8f53421cef2d48732f5a5fd76e5f403849 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014965960710224479 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015008706182121731 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..47b7e47673abacc37a0db21919b9dbaf28ddb3a7 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, 
+ "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e368424de118014ff73639d4a419a679d0238d27 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.37, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01527525231651936 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.359, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015177264224798594 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d191267f92e5ad9b10ece203797c18cbc6cd2fec --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.364, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015222868840522019 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015316971293620996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f86024be10c2df5d179f35a72cc213c0baacf020 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.36, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015186527932040117 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.366, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 
0.015240612726405756 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..71d8c83dcb37685e27af215bd11eabc2fcccef11 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r1_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01505026612756444 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.347, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015060472031706618 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..efd16d5f4264eb67798d5876d4257d0dfae9aa07 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01493311749093258 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.352, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015110404505648666 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4fa93eb0ee44112c596e35754765b453e399098c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + 
"prompt_name": "GPT-3 style", + "acc": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014987482264363935 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014899597242811492 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..de4b1eaf44812b18fa262aa5e3cc4f287b36907c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.351, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.015100563798316407 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01497675877162034 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6d2da20bbea3d8e2f06485e41279cadf96f25085 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014910846164229871 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014899597242811492 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_4.json 
b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c9b490ef1744312e95a967117fdfe707f911cfe2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014830507204541031 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01484221315341124 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..71b96b1a50294c8e13711e46a131b8471e7d066f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014899597242811494 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014842213153411239 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..750397c9959fd138f3e185b452192e93eb5b520f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014922019523732958 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.349, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015080663991563097 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..110d7679511d4cb05a9ec86fe2ea51ed130e5e02 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..40d1de15a264f7ae05df69198d1b425ef4e6c00b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014734079309311901 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014746404865473479 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..660c7724b918cf3a18b8c1d604c557c606f495d6 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.308, + 
"dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014606483127342761 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.309, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014619600977206491 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8aed3e0b7168a967cad8a1ce0e7ad1fc2ec2bc86 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.3, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014498627873361427 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.31, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014632638658632907 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e1c25a07a6f6e5c8c47b9afaeaee0ace19cb87d6 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.304, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014553205687950418 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014746404865473491 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_0.json 
b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..451f5410b19c08ffdd0e0a1e59451cb18d252e18 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.015008706182121731 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014888272588203933 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..135d4ef98e0f8215f7701289b86e09332a01c91a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..94c4441364dd70907af6b40f18eaa2759e0c772e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01479492784334863 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014842213153411239 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e148e2d33a8c6d48cd6d9bb4b0fe68566c946bf2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014955087918653595 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014955087918653595 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..452392068455824265881bbc75487b0846b639b2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01465847437050901 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014842213153411244 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..adf5a25b27583e52d5259e5c5f920973f61a8b7e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + 
"acc_stderr": 0.014721675438880224 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014770821817934644 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8d82b2057bd0a838e04ccb4a2fab02bf7a3b8b1f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014922019523732963 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014734079309311901 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0225a370d331e2f38b599b1da2b4a5a87e84bfff --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014709193056057128 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014709193056057134 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6a173ca1c0116cbaea9ea1ba7e9ae0cd778d79de --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014830507204541035 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014794927843348633 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dc92bbcb277c415e63cb084a62f10e6ecc0ea5dc --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014721675438880213 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014782913600996681 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ee18a0c0493b03be92de8a43477a29a36c6a2381 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014830507204541031 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + 
"acc_norm_stderr": 0.014876872027456734 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..823529050bbe484d06421a4301b8c65304170e47 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.309, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014619600977206484 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.304, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014553205687950455 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cc92af5bd9264c31a413296b5e1e6b08a73a3ea6 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014933117490932579 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014944140233795021 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..346132d124c57d8fa29d3b49dadcb5e224f7f541 --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..830cdfce34d96244f1a8ad191fd754ee4a5418ad --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014818724459095522 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014842213153411239 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c6e5c19920ebc164ff127f959a5fa7b431fe601b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014806864733738863 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014944140233795023 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e241c5f4e37987f08282d53a52cd53f9c6f99947 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014782913600996685 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9edbb803fd56f5a6e0d5c8f542cd13eca62143ff --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r2_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014734079309311903 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014770821817934642 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1ee4151e0d2f9f776bf4ba0e41a18017fcbb38ad --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013639261190932879 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3308333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013588208070708993 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..47de473612b440e7da448b3cfb9cbbf1a4ba0e87 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013605417345710528 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2e662474fcd355a85e367fc072b6aca21ae9f789 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013622434813136778 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013630871843821477 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..39cab629d063f11d42099ee2f1f96e974c6a7780 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3358333333333333, + 
"dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01363926119093288 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3433333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01371263383046586 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a72abd287f9ce1572dba67bce72e8dd3d030f6e8 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3425, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013704669762934728 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.35333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013804572162314925 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..51ccfdcf8288cfda9ca004412df7443adf689a3a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3491666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013767075395077252 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3466666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013744022550571946 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_0.json 
b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..60d2fdca946ae0fb05246b376c63727fa7e2fff4 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013630871843821476 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881627 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5e4472763e6458d0d5746df4cca1f702c21bfd6 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9b40fca77bff067cb0cc86a23dea6a0499df97e1 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013664144006618266 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013680495725767797 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3ca124a319b4b3e1524f7ef075aae9eb5e7a33de --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3308333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013588208070708995 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013639261190932886 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1656fe246bd9e9a099680bb37336691ab3ad5877 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3308333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013588208070708999 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.31833333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013452948996996296 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..65f30c045d95e058fd2509b023225d0d83804d73 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": 
"MNLI crowdsource", + "acc": 0.3175, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013443538681348052 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01346230971200513 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..885b4099a9799c4841769834b1171bb820fb7871 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013664144006618265 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013639261190932887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e19b9a8ad7cda2411631be7f3a4606c48e970ffc --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..639430639e012e85d92a03e7a0ea49338ecd3481 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3175, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01344353868134805 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013415009084004862 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3eb4c7d0d40011bacb893601fdf7d67b55e03475 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013613950010225603 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3f7a2ff9029cc9c9c8d314af76ec58417f2d342d --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.31833333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01345294899699631 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013562032919529019 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ec45935764c5a710baf054d71b6ec8f324ca6ef8 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.31333333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013395739415639082 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881624 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..99c789c5a9ec83e6d795ef0596f2c68aaa0429dd --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251954 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013471620929769144 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9aae7d87c48096204f1a69aa29a422f4261051e2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 
@@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013613950010225606 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e67bb5dc24759f46b7629c8e543c935ed30d1796 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013508372867300215 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013526454480351021 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..71fe142df22c400fcf10acdf328beaaf1f97c4ed --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013562032919529017 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013490095282989521 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..93fc573127fc78a508022e67a08bc9bdf663450c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.31833333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013452948996996296 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.31166666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013376268790982103 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..35c8f7fca86aa3b1548aab45bfa8b308cc103420 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.31166666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013376268790982098 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.30666666666666664, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.0133166423190707 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4ac928dd8ff5b6c12c35368a65cc85cb7497f620 --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.33916666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013672343491681819 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013570806258433628 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f0588dec18e1b77153a622cee4d38510a813dcda --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..18f0f25cc81d98d65ea644021641b870d7a4e240 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.30583333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01330652625583115 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.31333333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013395739415639082 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1a07421bbe3f6030ec2fb4942eab2f7c7d5fe8e8 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406401 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01360541734571053 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ea0f9f49f024a4ee2882177d8ccaee63a79d956f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013508372867300224 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013499258621103247 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..324b122a912269c22cc84c835fa9b1831f2b5eb2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_anli_r3_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013296358936471115 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.31666666666666665, 
+ "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013434078660827393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aec550f5cb3a4821d9bf94a78bfc6101edf0ed6b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2226962457337884, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012158314774829931 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2226962457337884, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012158314774829931 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d894ab60bdf93c1a1ab9c7e68bd3beff9714f1cc --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012288926760890797 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012288926760890797 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_2.json new 
file mode 100644 index 0000000000000000000000000000000000000000..ded472c50bbd49fe60ad72288ec31c182715e265 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012336718284948854 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012336718284948854 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c318420bbcd811bb9f71e6cddf663b184ed81dcb --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.0122728535825408 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.0122728535825408 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3b0dd56840c41f0be24f5ebc0ac6ea10f52f08b6 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012336718284948854 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012336718284948854 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3a8dfdfcf95b52b4bcc5ecc9151631d8bc0bcf1f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012336718284948854 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012336718284948854 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..806ebaa5d3cc69356022de6d0e65adeb3517a502 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2627986348122867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012862523175351333 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2960750853242321, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013340916085246263 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0493fcc3d215b32673a429e1a4f20c8e0a7cfa4f --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.25170648464163825, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012682496334042967 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29436860068259385, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013318528460539427 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c3037d563a5da0a796a41608f9a42b301f3c995b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.24146757679180889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012506564839739429 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.27474402730375425, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013044617212771227 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..da9344f7c285bd2d140859f7a836922db33855d1 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2380546075085324, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01244577002802621 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.26791808873720135, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012942030195136432 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6624d170ddbe14968fd1b47551168eb88f9d157a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012414960524301827 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.27474402730375425, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013044617212771227 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6f843b9b26dbf2db17b7c7475a7affea78a86be4 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.23378839590443687, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012368225378507156 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2687713310580205, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012955065963710686 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ff513adb803e9c4073694bd3c8475e93f17161e8 --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2508532423208191, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01266819862131543 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2568259385665529, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.0127669237941168 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b5c1e643e1c5d224c47f5d6572a2bab24e6b7005 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.22781569965870307, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012256708602326907 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.24829351535836178, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012624912868089762 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b2cb2eec86d18f0e4de797a15ae210282ccfd90f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2167235494880546, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01204015671348119 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.22781569965870307, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012256708602326916 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dffee7b9f2c8991a37c4d72b935f12af16017aeb --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.22610921501706485, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012224202097063284 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012272853582540802 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f2d4a018592f0d8625f6698280d48b7d011564a1 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.22696245733788395, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012240491536132872 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.24146757679180889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012506564839739434 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c4dc9fdd4aa6e26cd29f81178ffc3e24223c90ca --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2235494880546075, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012174896631202612 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.23037542662116042, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01230492841874761 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f0466eaeccba7739fcf5cad7a1daae0a7b398d64 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.22525597269624573, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012207839995407305 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.22525597269624573, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012207839995407305 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b14641ee7cd0b3eaa0b7f64c91a2f339bbf78a0d --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.22696245733788395, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012240491536132873 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.22696245733788395, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012240491536132873 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2e0b402a3145b3a60a485e64c081576852fa04cf --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23293515358361774, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.0123525070426174 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23293515358361774, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.0123525070426174 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ff9752424e440b3a156791f7184717baf0082e30 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012536554144587094 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012536554144587094 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 
index 0000000000000000000000000000000000000000..375b93dba907130dd18c1aedf6b312cc63be56d2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012414960524301832 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012414960524301832 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..458bc8028979e431035359b865e8efe7b7bfc2be --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23976109215017063, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012476304127453963 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23976109215017063, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012476304127453963 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..390afe91bdf384756a67dead9b46db3b2df25af8 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2593856655290102, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012808273573927092 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29436860068259385, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01331852846053943 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c110466819c7546fe0a3a4050290be44596fd2e4 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.25597269624573377, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012753013241244518 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2935153583617747, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013307250444941124 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8d43c27b9a2d03fb6c47718d9204d702a27f4c69 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2568259385665529, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.0127669237941168 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2841296928327645, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013179442447653887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ac574a597985f62bb63c40791a0a57da5a8689fa --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_3.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.26023890784982934, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01282193022511255 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2696245733788396, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01296804068686916 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a89093f34b5e2345aa2e3a7f44a5e9b5ba82d85d --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.23890784982935154, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012461071376316616 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2790102389078498, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013106784883601346 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..73931f877af0fde0debe7b8d13a46028ab8ae0fe --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_challenge_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.23720136518771331, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012430399829260847 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.27474402730375425, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013044617212771227 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..da930b6a364d8aae9e29e3c7ea946ad3f31014cc --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.26262626262626265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00902986177676375 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.26262626262626265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00902986177676375 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..371a0e4ec47c99f0af7ea608d1eeb6667c4b9670 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24368686868686867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00880917174472056 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24368686868686867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00880917174472056 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1852a2cb624225c1adc4a77d95e5eee43a34be0a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2361111111111111, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00871448049171129 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2361111111111111, + "dataset_path": 
"ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00871448049171129 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..55ed5faf321015148d64a425bf860e218ba94a24 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24452861952861954, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008819461106822605 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24452861952861954, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008819461106822605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f58a5d4f56809fce976e523d5053bbda2b4edfda --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008844984581934896 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008844984581934896 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..a794ee36a7b08edf9f1c8bf4c78609fd96e95031 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2521043771043771, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008910024163218198 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2521043771043771, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008910024163218198 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fd70fe8abd573f90949e03611888d0dd630e35a5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.359006734006734, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009843424713072178 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.30176767676767674, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009418994158522521 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a312eab0524d679f94268daaf8311f14ad0ec9ca --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.32575757575757575, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009616642976885977 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29924242424242425, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009396447162309824 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fa1a697ee2fca9da7a92a2af80ef4ec8f76f8fa7 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3181818181818182, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009557408782506376 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29335016835016836, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009342508331708558 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..85298e7a740d58d4dcc542f0dd381bcfac69db11 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3194444444444444, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009567482017268088 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2958754208754209, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00936585413414006 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..520378f8d316c1ddb0693f17d9e255c0910b88b0 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ 
+ { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.31734006734006737, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009550648343947768 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2836700336700337, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009249781691140744 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3d067b3970a5ec62dbee715d47d612e228b807fe --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.30934343434343436, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009484615220606831 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29124579124579125, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009322788837938863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b5ecc694c628ef6ab22db31903ed4f00db1d9830 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2786195286195286, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009199329195026362 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.27104377104377103, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.0091209197417606 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..900b4db6d86bf447376eb2eb3d853a3e46e569a3 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2727272727272727, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009138630726364233 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2760942760942761, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00917355987383526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e63c10ea882b1886a7926951cedd87e1f28397a7 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2777777777777778, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009190779909649912 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.28619528619528617, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009274470774627728 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d11912f480101ef8d392136fdcf257a5c558a9dd --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.27146464646464646, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009125362970360623 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2765151515151515, + "dataset_path": 
"ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00917788010146828 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0efb38406377e010b07f58efdd87917763877db3 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.27525252525252525, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009164888895174743 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2760942760942761, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00917355987383526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3e8b2a5341c92e0a840290553fc5cdd712198a37 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2676767676767677, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009085000147099363 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2727272727272727, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00913863072636423 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..d180cde55a7d02abaa391c9ba5a0c091858e8a8f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.26052188552188554, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00900643589033659 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.26052188552188554, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00900643589033659 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..527af006204c16a3babe7d721917828d746e977e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2441077441077441, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008814322157999387 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2441077441077441, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008814322157999387 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aab98d074b9829d85b5710f0eb0ea5f9a754f888 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24326599326599327, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008804009846865538 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24326599326599327, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008804009846865538 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..23b005ec8735acd3f1939a9992eec22ba03b4537 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.0088449845819349 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.0088449845819349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1f18141a8effc4679322240dc0dfd2dadf7a6c97 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008885233166386385 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008885233166386385 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..38956628a2e4c4e709c00e1a6f52770a1f7ff0b6 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25084175084175087, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008895183010487386 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25084175084175087, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008895183010487386 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7590c626dfc68419bd085db0a1e9074d71c4d3f5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3367003367003367, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009697166595752467 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2958754208754209, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009365854134140057 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..68101783f4969493fb1ee99264b9e01b4cf48d02 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3181818181818182, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009557408782506376 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3005050505050505, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009407763090599316 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1495918bf04c8f3b544a686dbd9eaf39694dfba2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.30723905723905726, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009466688832475378 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29503367003367004, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009358110551087425 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4974ea8a6ea10b623e668a947be38d72a0febc74 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3106060606060606, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009495260551195607 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2962962962962963, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.0093697115856843 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..55d037a81435bb1d9449837b917ba7ee506e3633 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + 
"acc": 0.30092592592592593, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00941151619378719 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2925084175084175, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009334649503078416 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..941fe51a3cd72354e5d68878ddaca64d68544512 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_arc_easy_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2975589225589226, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009381226721815537 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2878787878787879, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009290733161670159 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..62a1baa691aa4dc1a1bf1afbdcb14fe5b6971c99 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5886666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008985524690229497 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6213333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008857326053368308 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..41994a21a21d725ea872cc4ea721aa5c4a3e8e9d --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5873333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008989877766895466 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6216666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008855801251873015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e1891cea9d55650416d1ec434ad1b010784e9561 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.564, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009055127374988179 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6013333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008940758594209426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f7cfc4f897af1422d901e5b53b6848d7a8c64986 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5533333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009078141663938732 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5896666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008982215188519148 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2bbe8ef6c1a6d28460fe5442f365284bc6901681 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.551, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009082611478924389 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5806666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009010624844204292 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d1a6a9233a3ea983d259be895b743745f5a34229 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_GPT-3-Style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.546, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00909150987738651 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5736666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009030591966818144 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..053ffa81690ed1de010f09eaab152888a5904b5a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.621, + "dataset_path": "super_glue", + 
"dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008858846410222197 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.4493333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009083233528874787 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f62cd09bdb78b0b7840222a076dc7e8f4fd86c50 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.568, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00904540065950836 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5613333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009061278956794627 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f52a05fc8366bd39cdc5e6446f7ec19560b1874b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6133333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008892593055774285 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.6106666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00890378508047089 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_3.json 
b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..05c872aee77758df42f5e07dea7a2351f082ed89 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6183333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008870849530787626 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.6173333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008875277637761272 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fb23d1121ac5f1765983194600f00b4a86b72da0 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6216666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008855801251873015 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.6216666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008855801251873015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7a7a9b673b46bf56aafaef01122e2d20ca05ace1 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_after_reading_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6226666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008851200156534388 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.6226666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008851200156534388 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c8dd1d963b94b9975b58a7a1beccaa030f7f7700 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008846558976258922 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.593, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008970906255948518 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8ddf2129e2fe1598d4ff2065e181f1a4054fbb58 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.573, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009032396953831089 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5613333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009061278956794623 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d10339aef10c1bb27273e11b1599167806af8f7e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5823333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + 
"subset": null, + "acc_stderr": 0.009005596833757835 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5753333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009026006087500427 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..41d69a280ba2d64ea932b8362611da62e2777ff5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5876666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008988795877959723 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5773333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00902036441484364 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..48c338b5d0e20a43ff2510ec88b084b5b05049d5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5776666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00901940941590418 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5713333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009036836097555087 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..5b65d3e888b8af321c61866a424cdfb55d24fe7e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_exercise_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5713333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009036836097555085 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.567, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00904788859878573 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..483d068fd4ea0f5553e88ec1fd2bb964a119cc4e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.605, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008926639623340282 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.44733333333333336, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009079439381402944 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e0281ad2d8ed184b6e2e6021c0479a9668dc925 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.613, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008894007408882734 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.6126666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008895417372116205 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9951e609341dfe06d5f477d9ffecec993c6c651a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6116666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008899620943397689 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.609, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00891063782727302 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0afe1be982769131696a4ce0d78c75818dbebff2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6143333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008888323636208591 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.6096666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00890790983863795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..01ef2fb0e5dfa995d19dfd171cf7a4df2614c3fd --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6146666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008886891702571046 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.612, + "dataset_path": "super_glue", + "dataset_name": "boolq", + 
"subset": null, + "acc_norm_stderr": 0.008898224137298402 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..38216166eb55f3d6487d6b86418ea8c70ecb2660 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_valid_binary_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6166666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008878207616769261 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.6146666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008886891702571046 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aabbc954301ce51b8fecdbebc0715725bb14d646 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.596, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008960362494453696 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008846558976258922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8f7a7d404108f8006f5d2c6fce4dc03f46f58bf8 --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5403333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00910047692710895 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5416666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00909847370190195 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1f9e96a2b02f9186ff9832cc8a77d566e0713cfd --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.585, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008997332048705705 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5883333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008986619341172333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c13f646f914a9fe1c3a2d1d8e577107e2cecf71e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.604, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008930542249025198 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6076666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008916041436343385 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0ae48583d1b1467d98ea604ab7e847bca4dd3926 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.6026666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008935685051576499 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6103333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008905164372580982 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4969a2c66975f0c8b79c9d191fe4a19b9660c2f5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_boolq_yes_no_question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.6033333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008933122315228997 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6116666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008899620943397689 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..424f4fbf67f7ae0fd91c709d807b7eaa5327dfbd --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.1940928270042194, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + 
], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e18cceb438637319d33ab1d7f87c2bee0a47b974 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06460957383809221 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.21183261183261184, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..736f3628b9136531082bb86713d21a0c6978ec2a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.21400304414003044, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ab1720de6625aadb132b46cce3a054d81058f7c5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 
0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.22946009389671362, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7f6eb7446d150562d38334e6c951ad5b52a7c8b1 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.230804179918219, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..368b94560508bbca737e1f3f10a85f79f0fb68e0 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_GPT-3-style_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.21400304414003044, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bfce81b501c2f6bd6465db2d8ee9664d78d2d6fd --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_0.json @@ -0,0 +1,33 
@@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.1940928270042194, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..488a804be35ecc037918e5863a6a0374f294c14c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..57f12661e1ef990c337a8e8ca72a029f247b3c3a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.48214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0673769750864465 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.34, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_3.json 
b/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..29aa437d5e25b764deec10d183f473343b906496 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.5357142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.37777777777777777, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8938cc92b1c4da60683153c567760695980f2de9 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.5357142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.36377708978328177, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f009dd6d86031626e1675a3270497e46bcbdec22 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_MNLI-crowdsource_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.5535714285714286, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942395 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.35643298415256514, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": 
"", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..72cc79651de80967c33c54f59897225f6f39be39 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.5357142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3764875586007934, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..36e3beaf4770de4a5694ad6f338de2fade762953 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3950558275a8a68a00086349d5320ff517f75c4c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0672477765493766 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3466666666666667, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..06a550a42b721fb6929d7fb6dd325ce0950446e1 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3645231677576691, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..96d4a6955532259d26e14af6a43a1db512a3b200 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3587301587301588, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..67b275d3a9b7866518822545f7119ccee46e1ee4 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_can-we-infer_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.5357142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "can we 
infer", + "f1": 0.38165374677002584, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c650325c43b2e308d4a6d3a487b40f77cbc4ef90 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.23214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.056929390240001085 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.1873873873873874, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5445b87e21691e5da4e956efe5f200e5883fe5f5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.17857142857142858, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.05164277182008721 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.18229665071770332, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..faf23a72493d608c8be02029dcfd3ab2a6fbd99f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.26785714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.05971290310957635 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.24590576971529352, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..71210b8e121be5778f062c71e157fb3a75828cd5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.23214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0569293902400011 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.20256503424980152, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4732b187afe30c9d384c7a59a418375cad1b807a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.2857142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06091449038731725 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.2244393241167435, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9e39ffde8f001120ab88da703ab714c4a026ca25 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.26785714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.05971290310957635 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.17522768670309652, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..676caa9361d945240fe929595973dcd628c44b1b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.625, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.44338000491279783, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..67e1ec1473e0d75ec032c6fc27eface6b6a1c007 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 
0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1da9ad97239de1af257a08819c18939dfa7de781 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.48214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0673769750864465 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3494339622641509, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a5fe4c8fda854fc84c94294da6075452c09e41f5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3538011695906433, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_4.json new file mode 100644 
index 0000000000000000000000000000000000000000..14d82984adf97e41062a174cb6608308e6481b53 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.5357142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.38467432950191566, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..54ec5de0dc7f5bb0de8276cee092a5d202e70a46 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_cb_justified-in-saying_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.5178571428571429, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06737697508644647 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3583333333333334, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8548ba3d14f105a10d63e435d630a00946e85c62 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.55, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c4f5090054a8b50c0d85089ce8ec59d312ad2b1b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8f0ce8090c808a79e0f126101ecbca57af540e13 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050251890762960605 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f636c8bcaa61590f20e65816c522ef8b015ff518 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f5dba74dcb48916b68b4f4225771936bcb2cfdc1 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..71bc78dba675ac44458ed74faff90329d90969ee --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_best_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050161355804659205 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ac2a01a152e95a497a4abd153e2371dc86134c4a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.64, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + 
"acc_stderr": 0.04824181513244218 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956911 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b9eba144437fe1b155e76b9b4135990a2bca825c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956911 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eef4ed48c798bcd342cfad061c866bbe0acaa191 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050161355804659205 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..c498fb80d2ba0aee763d08aa9b4926004754e3a3 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050251890762960605 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..763646425ebafe0abc8864ea57ef3c6841f9e71d --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050251890762960605 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956911 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ff5126ca0295a31be02f2b0139992283fa321a7d --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_cause_effect_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050251890762960605 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..078febc440695c106aa38593a393197cda83ab83 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.68, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.046882617226215034 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.58, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04960449637488584 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cccc9e486eb2e0652dc824b1bdcb5c52833b9766 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cd0f575cfa6ec763a2631cf0e27fafe9002f0621 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050161355804659205 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dfc891153d21dfcf7dd4e9960466c779c4495439 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4f58efd942b41f6e29f9a12040963456a41e969a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0973ed47ce0437f28677f0dcf092f9ff6592bf30 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_choose_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": 
"choose", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..464878d3e4656567f0554c7014819b26b14e35b5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.58, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04960449637488583 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ede8fbcf9b00277a46672c0e47036b2c4dcbf641 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956911 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f0a617abd13bf2a08c588701cab61299fdaca495 --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049999999999999996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a63ad41bc34ae3f46b0a525ecf07aa11c464dbaf --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049999999999999996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..568186d2dca4ed029846cc9a5fb65ceaeb325143 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9de198ef65fc2af0148b04589b92588a866107e3 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5f75973097067e3159f584a9946cef55c9e77822 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.6, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b13ee7d681a7c9f5c5a062f9f0767a606b92642a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620333 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9fb34a2e1d938237a09f4b740b856598a31ec379 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b32e711f9865cd86dbf32238c18b2dca6a0ee9ee --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d200d0ed3422f3c9dcd7cf5b6b7db7942b3e281e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_4.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956913 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ce538c4529e6ab3582c83fa658ef9543c0086f0a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_copa_plausible_alternatives_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..49ddb6a173e1483ba5482c27a6a0485c330138a9 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 2.893181093567166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.050771258383273614 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.22721101652903636, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002174631786151447 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.37723322238831786, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029605308604336612 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.273629646761964, + "dataset_path": "e2e_nlg_cleaned", 
+ "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00219562740727204 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.07640865030189278, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012649024640329508 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.1250175906265568, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019464934211797466 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.09100215554293947, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014003878631314351 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.18768076986646293, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0015662913202835055 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.31630912991285143, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023181337659267383 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.22738917572581716, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015849122049742528 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.1945842233274229, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018735455393389011 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.32636244393347086, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027067207730447885 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.2352022676875879, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019310924562291735 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..858812a4d6501b486f73a338ab4e6c4aac9589f7 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 9.399402743953457, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.17924144077003742 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"coherent_text", + "rouge1_precision": 0.47386061055231526, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004267753730229592 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.38119947510045676, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003610048652083222 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4002878107328038, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003311711914052721 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.21711896294585206, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002796447264323068 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.17110129874947536, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00224417594257099 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.18025992915993652, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002178035933267876 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.34515720589173515, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003487240459637474 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.27473181051861545, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0028013542645951507 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.28934035658943014, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002625512046929914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.38735824622409254, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0038432739893741875 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.30946758998472124, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003145910771718272 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.32576004187947605, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0029528457213781733 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_2.json 
b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8506dca7adb784a4018e163cad68202477140aaf --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 12.155518015461812, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10084300095584796 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5512470663299613, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003380008431651804 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.44400338391624267, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029538365913921783 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4675919630941539, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002380211778611894 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.26274066794234024, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026741792596663163 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.20911393627659425, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002213438817982695 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.22010440502587322, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020527201506163662 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4042733522247706, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003014974305975856 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.3236197121255908, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002493240331238564 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.34129076382732576, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021490279108278207 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.45311242528524515, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003250337053382386 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.3632843159031807, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002717180891099093 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.38322006389684193, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023453120078189096 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..afe10232c69aa3b5388bba2ee0a90e375a2bd919 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 12.818480424555865, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11880096488769476 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5716424958707295, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003245966702252894 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4533042534124351, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028426294428620666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.48141444524720783, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022227475636040234 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.2761805127749399, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002656621555462259 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.21740253682265098, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022432953731971084 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.2302982218465413, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00206470098719534 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.41989832427018686, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029524344642488424 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.33057724250995923, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00239153458363529 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.35184243194966225, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020609113731491014 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.47142486480907, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeLsum_precision_stderr": 0.0031553751764853 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.3723049045197885, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002645921584578156 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.39598482328679385, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002247763634503275 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1f9b9257203171e16de5934a4061f6971ebdc8ae --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 13.023419310896914, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.18787640339568382 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5811755264625634, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003279952136584487 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4531163332976295, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002808885692946532 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.48507558149893437, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022388986956659146 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.2840273053174427, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027473207641567276 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.21849814391644223, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022164052552589037 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.23379683139470345, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020763058969761714 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4270926156453398, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003003901373753685 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.33141755050914024, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002423830245139234 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.355063998866477, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021030562380576637 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.4795421964847328, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003223552047844655 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.37256874707266874, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00264137203801963 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.39927685598165713, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022893014461836786 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bc019c7d12b20f0552452814dba4350f82a8c50c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 13.13874169567919, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.15764383116806005 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5894664795975357, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032849183519925037 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4516858553529017, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002776001645788723 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4886001619644043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00224088818064398 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.2911074453008736, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002754346176578065 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.22039619320267384, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022225995765214612 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "coherent_text", + "rouge2_fmeasure": 0.23823194648216817, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020968310082903024 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.43258691970018376, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030113124612643483 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.3295502069878702, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023782291443152845 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.35702681875777975, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021061422127890642 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.4859598571636165, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003236922862238485 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.370994090640816, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026190329216342714 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.401781353582733, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022967926955326317 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a5085e6dceff23f18e315aa2fceb70885ba11a25 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 3.0806537976803305, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08654959981418281 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.24923430615693465, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002552336287648743 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.4156926007465285, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0034726402924926524 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.3040771492183391, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, 
+ "subset": null, + "rouge1_fmeasure_stderr": 0.0027859763603378437 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.09047270537738747, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012069986829003393 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.1527487489689877, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020328633725306884 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.11078262934301955, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014268073506531276 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.1751861657827373, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0015166532602682921 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3033207669558517, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023528098672245283 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.21656571042527803, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017004253286561153 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.1972457019261279, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00203359756864001 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.330657714477746, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002879107948785049 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.24105076727059485, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002242976032324967 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e2be15547b39e4cf50a744f6020eeff9742dbd24 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 9.677480998476096, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11714095702469358 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.506140960041348, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003945234017119319 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.3944081946120404, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003355916640555891 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.41881888025365843, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0029474810209025957 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.23089703057281993, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002767397153643465 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.17599343128586165, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002162757026079701 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.18735030535151137, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002076624224100413 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.36929998823109533, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003311934706749638 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.28430568800970424, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002617795785800394 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.30292303705529133, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002380196460573085 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.4139933304264028, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0036069187091296155 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.3206920362797161, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002946183435512405 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.34120550977343506, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0026792347942173355 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7d77894a5e1d1152c2f993e79773d6594570236f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 12.05612102268846, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11261168026208578 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.5563180574898297, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0033097983809663666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.4433284363370441, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002894846309829138 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.4688987306368925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023019227203330225 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.26400701514420555, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026576362464051425 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.207135141008292, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002157896147095455 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.21917697225253102, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002001502790757259 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.4084452661546365, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029981863130726506 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.322698679438782, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024106169595112235 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.342134715446231, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002081984836426825 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.45805476385316257, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032136214215560794 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.36275138594783457, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026398678390256143 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3845332666730455, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022694145068922577 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ec85a6f5fdc5de05ae8f2a9647bf9d496fce994a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 12.809308859394257, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12012089885991 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.567197996394036, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031941635563547547 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.45496464714534934, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028587538099377643 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.48064245798959665, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022104878204895783 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.2713808301404379, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026621379369616456 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.21540061776410618, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022399829182060465 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.2271609238297581, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002049696551360146 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.4154154613540611, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002920856843326935 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.330654493573185, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023883139342088418 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.34999974120577987, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002026696417621307 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.46722317705643734, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031218760227686975 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.3735607439910476, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026634600896822715 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.39498527306923364, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022345222837571576 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ebc0c58b52ddae3814ca976a9e3bdda54ef66c51 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 13.080220013614857, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11160584714298007 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.568344864604142, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032244220520078275 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.45346556859050147, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028165310549028376 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.4809596449905771, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022389806016051804 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.2747057715723681, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026868813527321516 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2165017740347286, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022132654915194188 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.22940323349641095, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020525835595875536 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 
0.41712827628191324, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029432229582722754 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.33039201411660385, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023910452617168718 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.3510224338515258, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002061496441416912 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.46944298374040827, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031624401569722853 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.3729547665117702, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026487677764114458 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.39611863123059127, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022799004681333724 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c04f38a5492d32120aff5f69afe65f990ca0082c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 13.126754982894552, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10360087872116377 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.5800244423214129, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032430845106271576 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.4522814700737286, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027480362924234376 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.4854500633563135, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002219496097131078 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.2817904995507888, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge2_precision_stderr": 0.002690162257773001 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2173596999234223, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002187185451199481 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.23322234263569483, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020699168576186902 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.4262225377338777, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029779169967996694 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.33076771892860724, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023724569222499923 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.35536850445962775, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020947916034111365 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.4805351741121597, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031782540412465792 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.37371143448503646, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025800203198591233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.4015046168561641, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022668461919812784 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0ec8dc72afeacdcbb939ac3200b8dda167e53186 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 0.09272307710118581, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.015507299531691698 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.06857149404466026, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge1_precision_stderr": 0.002714756076276083 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.03009537274390525, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0009494627429628092 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.03728897813365394, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0011576994918079029 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.03172208574045067, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016960895743799115 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.012438566317548796, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0004694763784147565 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.015494426267291282, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0005915200199122902 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.06643255860316054, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0026515653043268214 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.028870230253503604, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0008942016978715927 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.035838656895578976, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001095355689892508 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.06575040740426258, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0026720246479516457 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.028116552151616833, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0008796291436374449 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.03502489652281287, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0010881026456914845 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e0c31ed52ad2b1af9ad624fea4377edb9bff30a3 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 5.652249507377061, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14061164623877323 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.31222666399245746, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0038409642516790284 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.2665121107894025, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0035125435383699653 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.27298366766974236, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003251481554695709 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.1333588804023989, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0023201775029695754 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.11363089991243877, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001998959577037935 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.11595469608797514, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0019122129455938347 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.24182600036670712, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002834935214515838 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.2036428358852747, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025389192347628087 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.2097551265064886, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002334085005475068 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.26033955477578685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003317352229551968 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.22088539882783978, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002969599084643113 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.22681833491189718, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0027772963952946224 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ae348ed10be23026696a03c299bcdae0cf428d0a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 8.858168867127539, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.15358413684529032 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.38944127465466044, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004503183294613872 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.34014174486376614, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004015461323957741 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.3444247303322821, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0037208321217199283 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.1807977835141012, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002801765538636182 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.1572154217818733, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0024179638680786484 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.15850574855645186, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0022819154552772038 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.28782843422256305, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003320592623914537 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_gramatically_correct_text", + "rougeL_recall": 0.2498454599165124, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0029373372282745012 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.25349129697286293, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0026956970102644516 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.31914692411759643, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0038546708339040276 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.2783358271015542, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0034103422656029617 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.2819365226348762, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0031783863755039652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..198fe05d9e8e150836e7a03260fae26dfe097484 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 10.35853514789337, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12387147878155497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.44248133468645384, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004497810429174453 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.37990230663646296, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003927199237573467 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.38990963042752075, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0036858837285380693 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.21053296232652152, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge2_precision_stderr": 0.002881133238894378 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.17980536378141043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0024724275105467686 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.18411460451025186, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0023707420345364556 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.3187308429757642, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003373786617903847 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.2728727940909337, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0029399293650148287 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.2800852266792753, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002743206810693273 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.36059219853066166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003917787632492712 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.3090446652384135, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003388294524398576 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.3173742799146112, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0032163177711543606 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7fe2c7c34059808b176ba7b5b3d647823fca65ba --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 11.515556371710902, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1521281614192603 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 
0.47723640823076796, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004287267359218536 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.40660753931112564, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0037138867542392297 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.4195774576048078, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0034528003933074034 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.2291073642884891, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028107927329324915 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.19454563878113562, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0024341427164360544 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.20036043048947655, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002329405952733126 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.34087412726024163, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0032590582635772347 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.28966452741743237, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0027984443113154285 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.29891705821656117, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0026005964258569996 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.38895612444514427, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003776148236077503 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.330648039857798, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0032240413111297886 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.3415819675283937, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0030544859332802426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, 
+ "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..aec316214843f013d88d62210e43b8a38a66ac64 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 11.962882852349749, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12492026969156166 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.49857309965795027, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004095692107512742 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.4195162433312554, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0035246726867291895 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.4373024691627277, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003268691629876463 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.24035982989846097, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027975432302982687 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.20137381546159938, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0024063007560350337 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.20952873474512598, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002313066132505112 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.3521991129833718, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003123615311072105 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.2963793856785372, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00270044093552581 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.3086640751036808, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002504229241203423 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.40439342888738805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003612123346774531 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_gramatically_correct_text", + "rougeLsum_recall": 0.3399434796164529, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0030955619182597956 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.3544764119864059, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0029181394624907212 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a5b2ad5f20f64e31f634443811140a88e1f57ac6 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 3.205452750701432, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.029369723752371055 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.18794203900806028, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002509182484953893 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.27563499652889517, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002522516313059265 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.2026666683966741, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001854044988549804 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.04736777911623166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0009912253924337147 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.07529125273517918, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0015129136922472945 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.054594037237788585, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0010529893200416562 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.16091347621914998, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0020468972719200795 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_text_restaurant", + "rougeL_recall": 0.24052353002182375, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002110543832931725 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.17465641737008017, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014217853180858962 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.162408167198493, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0022254242193644625 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.23809586470877792, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024056082119999767 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.1754467264102351, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001775758147384932 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..04d0f7288703cce7f291923f4d1a868a8fc74478 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.314986164630461, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09886007247543747 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5405367881450536, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031944831139177473 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4265799747472657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0030051178365182673 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.45168976701280994, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023803410292144262 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2530645509899577, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0025820188553003945 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_text_restaurant", + "rouge2_recall": 0.19690559018039472, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021546256249304957 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2086255432248638, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002005283814444169 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3941062713485247, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028671074248121063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.307687838606323, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024236181103855607 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.32671702545878634, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020575751841782584 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.44239922810973775, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031220549443630044 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3472525108890491, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00274280048405182 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3682903569907333, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023311514221899116 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7dc5d6580609911da2a0002a11d4c714ef54e7fc --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 13.825382196007423, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14341892864265515 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5584732398385521, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031726923154704037 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", 
+ "rouge1_recall": 0.45947280161409504, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002947956873468934 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.47949895116223057, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002274496687632488 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.27533931731718275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026239982120668894 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2249642538757707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002262594422709972 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.234132554236615, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002050727643013208 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4144330916795227, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002865466880028744 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3391232820910882, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002487675496155266 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.35427830623229684, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020644801949430084 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.46586823562780794, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0030764860572737814 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.38321253954384926, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002786697010998289 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.39980527255607845, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002302036382207211 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..f6ab311f82e2c52f6eada2b47d5adb66e9f3e794 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.734710333150248, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.13327591335822025 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.566363810944366, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031329176207456266 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4738543349642313, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002916750104399688 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.49284383237521995, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022741384691834527 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2835262882274797, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002620037892529304 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23582515668443185, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002307150627998116 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24473795777586516, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021021762412204888 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4201683174467468, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028464798891116223 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3504188695034407, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025288933574772144 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.364532741901036, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002119688594264684 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4736089581434219, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00306065010387841 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.396405785185109, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002794473833800316 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4121332244451046, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023416760964820984 
+ } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d9ee1529055b67788a62a7bd5ff66f017e700509 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.080804791535655, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14821971591156646 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5624928789321237, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031294366446319713 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47592863926028084, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028908116130151665 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4935313098839454, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002271845302152271 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2834307304449863, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026187903085809257 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2386118604584549, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023078935639056814 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2467411297660491, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002101346812765226 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4178299399327378, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028182273866756095 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35288616277467044, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002503800280122615 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.36587999086871936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021050945906898252 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4726074434579816, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0030529008210380413 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.40030956626841174, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002788731746840601 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4149830902149149, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023565846719075163 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8bb85c0c290aab2f07e44e7a9959707f8de851e4 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.265600470980436, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20285527803005138 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5617758997558588, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031160645760995214 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.477780058424707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028609800200870505 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.49535698221178465, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022639532726268564 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2848892339570606, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026387395366262536 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24077912408729735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023046951728657333 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24925086960246148, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002125288231383812 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_text_restaurant", + "rougeL_precision": 0.4192693573282302, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002855212675860218 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35537258514643544, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025017841926330076 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3687908505277805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021558916218128877 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4743649010098781, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003080316334291657 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.40318795066110624, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027745110069664625 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4181866743293669, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002368198674279566 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..32c8220028e2c1d7356af90d92110d7192023607 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 4.571492146276437, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06410871617063811 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.1908676146970543, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001194006157470212 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.4356617762292338, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0020338819023735448 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.2605612470084477, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0013649714000006274 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.08067191193687241, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 
0.0007328285267680664 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.19180765723962376, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00171245316056684 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.11129123255973318, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0009537147017480764 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.15773820339990663, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0009130906909507374 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.36537444743215186, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.001890266179683196 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.21632110836430327, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001087079156733761 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.16951329717089128, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0011272646867077703 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.3881088934505719, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0020860272608716683 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.2315858520913805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001327505047025221 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b991e8cc5ad90e8d963e4b03ff8f8e67638799 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 10.539837668585216, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.17465438135688363 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.5480364269639086, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003312334506418998 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.40745113744298617, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002799036288361961 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.44123977635419337, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002211756735853885 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.2555460664426593, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002740757842598831 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.18529601706006868, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020284497579205283 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.20149328522490387, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0019532239658465902 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.40745757408421523, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003071919070780174 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.29894440995242827, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023043029752528896 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3250117513928856, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0019969004221661773 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.4504408531473228, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003267241907381879 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.332318789231438, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025568322527799218 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3607660614467818, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002203780125553279 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..614db355301841e3419ac852baa4b371800c85fc --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 11.979737123111207, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1777216675487079 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + 
"rouge1_precision": 0.5694774840044784, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032690447343626645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.43169795189062204, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027755841019906488 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.46621835136055, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0021811745216102235 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.27578961505709343, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027563670362028268 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.20476632102377532, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00211282040156597 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.22144793601758048, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0019896499062567576 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.4243204735629267, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003038279741971009 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3183922809822098, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023264664598331725 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3449621710063106, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002021110115853051 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.4705230491379006, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032090612787953627 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.3543590674460134, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002547140809474543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3836388316327653, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002192238878763976 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..f896d402858396b049de988b5844937862ee7314 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 12.462479849121463, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09061529648631804 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.58220595007106, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00329263853652951 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.4397725602332855, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002774975421382418 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.47693684077807014, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002210569646771097 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.2839931545986718, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00274726172380332 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.21123062905099813, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021777700695380237 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.22927798524401358, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002062255539404591 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.43127648354778575, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030397840353640227 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3232046086079628, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023417332544385812 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.35144747175107627, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002070073239316246 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.4789262682132351, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032006625726612964 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.3608549641566686, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025963317856745423 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.39176024632309325, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022580636053722825 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6cec0f9a334531a6815fcd91bce308415631df39 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 12.721474775009176, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1231471401393544 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.5882829470459832, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032579810236422493 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.43920151452158274, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002766526893491765 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.47944687065511643, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002226356134768176 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.2905552860238982, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028062355496161736 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.21355809012964339, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021815257564468844 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.23330038384366422, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020867339523630442 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.43922338563825464, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003062166327090343 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.32586336591450926, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023806352520646 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.35638075600135893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002117189037709182 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.4865694268823983, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003236938827400275 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.36252025368066076, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026170321708128956 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.39603605414957316, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023038968339561114 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8df0316a51d25d78e8be2cd3791171edcc86d755 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_e2e_nlg_cleaned_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 12.578909654685171, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08947366290256598 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.5955748121041431, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032883465766855922 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.43546535746599335, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002731334226696082 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4799825375133372, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022136759502543388 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.2963639637421549, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028409667830761806 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.21277263188099743, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021803230244831907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.2349236523331446, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021027603419611013 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.44379029665218356, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0031157354991706576 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.323130840043402, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002427154840974425 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.35648107973364707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": 
null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021733704043705015 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.4932209061993962, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032875700607312756 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.36003608717718644, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002619980949391493 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.39699261454485735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002322759270632993 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f521335a66a4f078c588323ca8f123062e5343ab --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.09615296353924062, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0015634664664492218 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2420926320977713, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003742160422644687 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.13605098361826837, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002138340789759129 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.014329381922408752, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006598814426382717 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.0376524650639389, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0017573473002010546 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.020520962151658592, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009410783718450363 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.07944997329667514, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011619662946455182 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.20129914286300007, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002862368913468282 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11259477615346435, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0015919918693647467 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.07747255863790849, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012436400498643209 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.1971312521833301, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0031341971345006906 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.10992024258016028, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0017233150363981779 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.7154102266273981, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07442684379362219 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6dd263514fbd18a1b40159b2923af5fc4a1cd8ce --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.10727843731122125, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0016429787642985747 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.22812552760950588, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0037592807959132097 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.1401897846455214, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002065563130684558 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.009366381220805652, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.000580611530069708 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.02227051513823528, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014598457069566306 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.012848193733260807, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0007993459523072702 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.07737127160035749, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011488670401891248 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.16356799430911556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002554947845779246 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.10061411067602495, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0013753187776155962 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.08597435637614392, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001284877552656849 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.18438119439843625, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0030689265824266213 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.11263425306203224, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0016337405096701044 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.5591102576081973, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.04726830548473858 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..87258309873283189c10d24dac091c92cbb220df --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.1353701115549322, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002553526241523374 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 
0.22433509698003423, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004121136445432196 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.15686903837773863, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025131912347000135 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.01735730166703164, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011030459333331683 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.030592710991937784, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0016233577292839108 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.020439860337785443, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011077341156553913 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.10015629558085037, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018598757797987388 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.16404884592949798, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002797494501855951 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11527800425354318, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017225562433088898 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.10553654669038727, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0019001720096507816 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.17730755997688144, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003284544255799627 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.12292764498886767, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001920929967654257 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.9148312975010592, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.03327013226559874 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json 
b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2e8083bea81137e370daf19f6bd02239cafe3b5e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.14823607772738917, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003044719039917616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.22710294289015492, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004565515334844266 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.16475864035896656, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029505584419723704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.024478360849588366, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012845736132388852 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.04089886902879985, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00199146152038313 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.028016924033823828, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001342276519470403 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.1122664526417856, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0022829408313811425 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.17187055503947934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034382777440656468 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.12440016864393556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021747021220266897 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.11602521107341782, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0023229102349895366 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.1811469470518433, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038206377447515404 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.12969554350306625, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002319691975532689 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.2528979414068826, + "dataset_path": "GEM/xsum", + "dataset_name": 
null, + "subset": "", + "bleu_stderr": 0.08376234817364027 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b599c6a6055aea56ee1e0c52d484496a0e8f66c5 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.04597223962710385, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0029601721330201006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.059262066595310824, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0038174584010459106 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.04591039911863491, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002791474184795997 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.007607457998339093, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008488904393593581 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.011485459027641155, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0011814997676546965 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.008239456967253532, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0008266852843405412 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.03533300507198341, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0023173210243399443 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.04477820142586464, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0028720813204613613 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.03480791338007262, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021042096800218883 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.03659752478918535, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 
0.0023802274165057088 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.04741613084877418, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0030892963972075855 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.03643953371509485, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022107928188988784 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.2598593623766219, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05321220151546882 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..de49978977a62656347c0c73700f2a7bcd44e3e7 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.0017152658662092624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0012123554660875527 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 6.855686455852068e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 4.891129215213428e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.00013175230566534916, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 9.393304330315343e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.0017152658662092624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0012123554660875527 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 6.855686455852068e-05, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 4.891129215213428e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.00013175230566534916, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 9.393304330315343e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.0017152658662092624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012123554660875527 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 6.855686455852068e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 4.891129215213428e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.00013175230566534916, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 9.393304330315343e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..faa442dbd9cf02491819f624c164f8c92df1bbd2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.14392087254096253, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00191218773257099 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.3335510125155157, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00424066588589097 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.19753122388376124, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024387420146840884 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.03293453791040819, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010849848050450634 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.0801087967883217, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002707529002010886 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.04574277927115908, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge2_fmeasure_stderr": 0.0014867843734761154 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.1092495491611276, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001447320489270279 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.25406306423955183, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003311772200910392 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.15005960824064515, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018551892590200018 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11488818638193536, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016338031980487882 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.26820478620797344, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003770590397028133 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.15789500500907694, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020981661003555896 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.7609629260492066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07394573684560968 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dd0f0e790b1dbb419540b5b4b468f1a1cf7e1b83 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.16788681333512273, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002996673841122846 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.2659066356647195, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004417630517149148 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.19054528836123152, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0027762385529554825 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.03216596990559947, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0014970450474531835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.055429634265232036, + "dataset_path": "GEM/xsum", 
+ "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024126272098439643 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.037445432438935236, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015708375864807137 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.1268648821509307, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002343416430911939 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.2007501282338804, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034454252437585383 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.14365798797701282, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002152785247950343 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.13249025632053385, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002382334028787402 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.21378735588805484, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038631534751242757 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.15133625112518032, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022987189077342857 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.7208126835215467, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12882874863062854 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1e567fd3e577723755a4d87c84218aa3430244a9 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.23190411064008373, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003928445720902683 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.27111252857924834, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0040695915462987675 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.2299166372281009, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0031451668265938455 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 
0.054979050862230205, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0023263664060471064 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.06284480502767972, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002347005223009533 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05345238966079452, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0020416410892504333 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.17596884203561058, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0032896946978595246 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.20395858533104694, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032166129006930004 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.1733526719843541, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0025885431117299053 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.17981583590210734, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003268601816031648 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.21287210159906927, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035404193682421043 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.1786191806974414, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002644651853081322 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.483152381487638, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12567252097455675 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7e30dc9da1dfc6c6e9c88686547cdc0c2397d881 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.23800276566402645, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004262627386705336 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.2467737856414683, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004201225613127943 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.2263782642164461, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003565639023860395 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.054965936961357126, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002410881639820072 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.05596347742993157, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002284054310082863 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05162608161498276, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0021219089599664137 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.18015571246200018, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0034901024101091738 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.18447874941589523, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032545586161932083 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.17010227183963283, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0028684700357903712 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.18347830366600554, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0034874495221723506 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.19105755973960079, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035164481426434046 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.1743013875297768, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0029210490230557468 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.6449487563680383, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13528609048586093 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c9770ca475e0ad432f2a0062a84df25228fd4953 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.06594749726663383, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 
0.004007317645309001 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.05829162428133866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003473909680439278 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.056708763963365785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0032944667691835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.014718326865253737, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016135099274249497 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.012961663415679227, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0012919513912192295 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.012692331817431131, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0012786487564727808 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.05107085537222431, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0032136006098534547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.044208562337105364, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002676338653334541 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.04319815386278401, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0025538782731808645 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.05196861792963686, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003240814781600669 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.04565880850659429, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002808420561698131 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.0442062674611244, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0026002268489095344 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 0.1609104692223714, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05303525426487922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ec48a8e8dcdac5c615f9cd9fa18b11866dc6764c --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_DOC_tldr_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.002686128768259734, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007731754762039508 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.0019582859026840618, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005536575190801022 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.0022309537417988285, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006289284051050972 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0002260595490340218, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00011315061618655328 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.0001350850407454181, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 6.834253022123397e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.0001681232813308285, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 8.442906183093734e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.0015604855435599053, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.000436200986701868 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.0011408291729870085, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0003164044579693306 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.0013014290599878409, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.000360060633092522 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.0018928183051379502, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005458572053716986 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.0013677555797593326, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00038471438802225113 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.0015671971090940894, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0004425697142281073 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.457981907406653e-43, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.397335680118716e-36 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} 
\ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5422fc5b9e88cf81cd8de0ec9b20ce69b1adb753 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1544803647320562, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001998179323003808 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3617501123052072, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004401350843198141 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2132476859870225, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002596728123368581 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.034104422429782094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011296363295058075 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08341719921119035, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027753920629418876 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04756359555453488, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015388801074832958 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.11047768692113931, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014728181415206798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26038442217264174, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003368032246087583 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15260364524737308, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018980795169799797 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1235706334215309, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016790133111005653 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.29156249557144304, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003855753079955984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17085907541578757, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00219601338601483 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7911957794729954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1028992752886963 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2aa9f3998a44f32b13ec56debc55dc8adff3d331 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.18210955050574165, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0030336588185256538 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.30765438008746904, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0045764351815007235 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21212307020665455, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002805971174994527 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03608017605336464, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0015351821810233588 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06530630731492977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0026678483835833266 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04293890964191621, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016754133257735123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.133576425244387, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0022712598584840454 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2257975651465562, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035054216942714362 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15522658921799362, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020874537466411373 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.14117554206112032, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0023065404958774274 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.24343402256399355, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003941092857511189 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"article_DOC_summary", + "rougeLsum_fmeasure": 0.16565749532885793, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002253857246658309 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.9369867872208306, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13301743127426643 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2f3847b288a0a2d37e0fc250fafe1b45e4b4628e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.2105718627026518, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0035778413560214016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2986830414520048, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004329540379099283 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2277383324826566, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0030573712743888364 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.04843737420278684, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002078874124454135 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06777424154476913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002554380728108249 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.051547066687828734, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0019622251118453704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.15888578947684742, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002954896445415003 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.22275234069768626, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003313554078249958 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.17058950590239796, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00245256877334995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 
0.16407328548937136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0029472894981933152 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.23520189891677276, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037453526896800695 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17778653214796106, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0025471887620010904 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.133141577656285, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12965820672946443 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..56fdc8e0520e2c7ded6051768f86e4bcb94ebdbe --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.21408207074509233, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003911771705487585 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.27616262040256995, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00445926401302726 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.22352993200471194, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0034158533172611158 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.04872713712902665, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0021207481086453045 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06294658987847104, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002453274077945568 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0505319645754636, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0019929881863291095 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.159438518303562, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003102061741719361 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.20607630574913904, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035265999447064503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16637492784414312, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0027123155106531316 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.16373162683207845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0030891132218923796 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2156279714811016, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003810201919065186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17215854927610266, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00276757297974389 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.2845962769932853, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1118763427251602 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ae3f2181c3f30d58a7e0776e04d221bd97a41218 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.06392907254960759, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003876201158464289 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.06750288591690386, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0039913615182408215 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05885821408809552, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0033400021306721392 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.014573840849595909, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016332201756935874 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.01510184445883284, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014126505769374265 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.012855134170914497, + "dataset_path": "GEM/xsum", + "dataset_name": 
null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011864715171664183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.04895381713163094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0030882253843767114 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0501669133523796, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002985189039759907 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04407299444650032, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0025320491349861813 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.05093801249895102, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0031687581333617715 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.05318442494362772, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00319392235102553 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04628126164174725, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002645018363815525 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.3124946765538571, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07230250867579978 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a0d6a314006334d64bc576a0db54b68b081263ae --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002395128003303927, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.000695243693352885 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.00195574424379908, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005505695407964535 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0020718557755462, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0005803475030367974 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.00013792342491448747, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge2_precision_stderr": 7.95881099815707e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00011258133899643333, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 6.616712432800021e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0001226302404615478, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 7.106807371334149e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0016797182212470382, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00047877790197405523 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0013438433718934864, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00036757969239639373 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0014461673142424087, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0003992063154628714 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0017248567966735974, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0004849887493451295 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0014510474885315653, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004115402839485218 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0015096956796575665, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00041391116095822775 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.0828965147492013e-39, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 2.1508906223493296e-34 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8a1ea17defc9465c64d993866d8f1490e111e7d2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.13956776165068546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019199393436886713 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.33339512043851244, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 
0.004403518880734348 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.19434908474166693, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002580674236992386 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.02954528480583196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010278446226912197 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.07384906277566924, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0026006210708488927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04167140246132445, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014421807877901167 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.10296399819932696, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013531250614748109 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2481421743812119, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032958052192564938 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.14366448064057566, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018391132136240471 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11110482915168116, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001566443275567478 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2671737010930205, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003761177660698248 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.15494320648091148, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021279657138113806 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.607536775722765, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08314365108296334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9a5eeaf8018ffbca3243cd54ab2c011a4e28d6bd --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 
0.1634034882580627, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002778000898524111 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.3297745025329579, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004476473426867562 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.2058050406751238, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026719707719642534 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.034771289771502706, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0014069026543471145 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.07187730165529026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002600700120998646 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.043558889183295355, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015410306979479628 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.12029005613595435, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0021151139292946894 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2433331878695519, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034049894620247537 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.15107763163746762, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019496226216399687 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.12853278724838335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0021998692763661926 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.26309292044402266, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003905125784954087 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.1625706073794353, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021908461537671815 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.7147680841837472, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08856857483686215 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_2.json 
b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..de76e9dcd5d6313ddc510aba90f48cf0f8c12e32 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.2109214359178399, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0036944383537942287 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.31362251846066164, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004396355711271353 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.22985666485690312, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029722882685021155 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.04995066744359591, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0020522782013875124 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.07352603410863771, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0026284467349718616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.0536753540466693, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0019243134991576581 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.15911804861593265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0030530508121933645 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2349429958098137, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034108172816408387 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.17246176189956652, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002416319418728753 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.16524804859917733, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00301812800842873 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2498146184241616, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003847756166445163 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.18120098504252252, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002535120819891981 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 2.208731949110297, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12846314186145602 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..13a9744be2280623d7fd4eff75ddab5d06ccd9fa --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.21988529529321785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00419402279091732 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.2943663747316836, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004684294497637764 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.22947123968727437, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003432595328640877 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.053900118661333415, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0022908140318405336 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.07108825092031434, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002629763702146578 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.055448723201269305, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002084868367528835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.16335356522009134, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0033163399393172656 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.21772234335500532, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036238704778221043 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.16982929076343528, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0026892298793044153 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.16944832007925265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0033011948856498242 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.23129127383235099, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004012907807026624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.17807867601617003, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0027835911872386137 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 2.3795755522326423, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09523488591207438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..848ab00c09b0fd332a1948285471dd57b18b89a6 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.06406893263545496, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003961269793753737 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.06853782086108365, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004110519937921475 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.058377571953581675, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00330940736804228 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.014371718618784664, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0015372576661987745 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.016136547005254933, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015536759212089171 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.013120867981094677, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0012122900136758608 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.04844168198127235, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0030911502808168654 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.050701248917136586, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030909279173779165 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.04335377999727481, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002489167957314197 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.05015315138408474, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003149051173293119 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.053513518304194346, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0032937944931802 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.045303807045979784, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0025901263701828387 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 0.3681689900396952, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0877135242120339 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..92475f113197e18707c42ba5a9dcbf9a729d1894 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_DOC_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.0022196819899664693, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0006389555240065754 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.0017648362791656974, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0004854257314122628 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.0019298414574400779, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0005424694841327019 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.00021367521367521368, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00010817206714968564 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0001435514171363228, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 7.2233251401255e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.00017077154074693042, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 8.588075389535402e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.0016999247726071723, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00048794679278765595 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.0013511577706827674, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0003704228627915442 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.0014733221159742111, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00041177963084147186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.0016999247726071723, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00048794679278765595 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.0013511577706827674, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0003704228627915442 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.0014733221159742111, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00041177963084147186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 2.3479106966936628e-40, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 7.024226621536986e-35 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9737e681cf03ac17477e575bf6dc8ca643a785b2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.15264482299030216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018892287776596762 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.3634914761237878, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0042673103293693 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.21237515311232552, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025066211781886727 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.03436173460976895, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010891793221324652 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.08578251655669292, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027573284566819968 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04839302460393313, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015221033585233316 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + 
"rougeL_precision": 0.10959444729742314, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013289111469351577 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.26324151102462773, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032151197123248265 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.1527576047947492, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017849830465469345 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.12201884539644926, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015609231269051787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.29254419732339304, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037093261325614685 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.17003724296721284, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002099393880724259 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.851658410571803, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0847044667439146 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6d3d086b9705fc3a453f8e1f709f077e507b7880 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.15258459454398002, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0028523927205491026 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.27693243154947456, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004646942496742779 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.18283425417693044, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0027713338212500423 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.027349319827138365, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0014363650968716103 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.05286158605763855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002448031923791781 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.0330637303888088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014874261834413535 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.11403787794008956, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002131574593844777 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.20704397122540477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035190916342608437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.13624007426933515, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002017206261906556 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.12015729412445958, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0021874344457937585 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.22214054465091698, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003980274533312435 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.14483517980421445, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002206068338853457 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.3833426601091705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07290009233361687 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..efe8abe065bc5ca8f217006f10cd55ae368bc889 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.17538129287952656, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00314746642942028 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.3035380968702324, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", 
+ "rouge1_recall_stderr": 0.004273062055225534 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.206890660644552, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0028805684860764903 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.036920195648714564, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016798438935385148 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.06315746254159717, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024967120331334 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.042917379641906755, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001729879440637286 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.13204571333099904, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0025622160400850377 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.22767522500785403, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003313578716290039 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.1550000731348479, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002253269731731128 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.13759372211139387, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002589486006198034 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.24084923941771985, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003691447454191783 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.16271652676665874, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023840903049219004 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.7166760614351635, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07548868629100476 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8ec658a5be32cc591d02c5071ff85064cd54fc7c --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.1722314870590178, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0035575987944855476 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.2846924373146273, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004686130007796534 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.19792978032299605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003191223312545367 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.0383730367297948, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0019206595568853901 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.06273993261989773, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002473625781893893 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04339127675794567, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001755983116522294 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.1296329568667989, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002873167572434554 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.21395972505616653, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003537281121092252 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.14829510892057157, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0024301211652535625 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.13501569230738897, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002923214874171359 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2264806298976215, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003998852680019232 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1557205369863269, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002608646494982156 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.8170891695141, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07365757852624 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3d06fe6d692aca2e56fc1d89a9e5d80660176a33 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.04719180405255273, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0031100681012275313 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.06440377111919457, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003989373096614079 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.04912151385172474, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003015141144280244 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.010781829086988182, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001247665158486873 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.014805196292695702, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014717021782107243 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.011144625832940378, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011272872776567269 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.03637888608887725, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002466167265372729 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.04889902154991311, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030204941398956834 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.03744254929445783, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002309310498043706 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.037736388154330454, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0025300867714614945 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.05122942299684587, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeLsum_recall_stderr": 0.003191606648282039 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.03906436488072567, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024078009069653876 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.4440622247505757, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08944228394328604 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6c4ebe7621f3c67b0e9d2e746ddf4b421d464657 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.004002287021154946, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0013363256771009757 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.0005892791785351531, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00019776344757350516 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.000996772652660801, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00032675508253975067 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.003716409376786735, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0012418823853125054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.0005356771202161136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00017474324053640678 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.000906495501807682, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 
0.0002871332293725657 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.003716409376786735, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012418823853125054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.0005356771202161136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00017474324053640678 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.000906495501807682, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0002871332293725657 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2f72f91415ac30b74489688e7bdef634b9157f03 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 6.40989788407143, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.24713703635154027 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.0828887653019555, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0023077389471523784 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7500480520389616, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.005976873385040759 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.1383957433525216, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0030655249622799433 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.06794065951065742, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002233204911500119 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5926367717275536, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007639129170499543 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.11292457706725864, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.003079977103975518 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + 
"rougeL_precision": 0.08220854090668749, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002300716890155535 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7449059048902814, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006047208011007572 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.13726205082911794, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0030563810579125766 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.08093935926897239, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002295609465381009 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7319720220794512, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00622298612963512 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.13502746066634788, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003057945370315767 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3d56951400f104d80b50ea7a6557eb6b0e8cd275 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 65.74799325895958, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.0647696730988823 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7066231343001421, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006476416418191835 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6882071233310767, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006815625624428434 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6842447015845402, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0066922247839962664 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5611098444032797, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007818571899546059 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.552831719648591, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, 
+ "rouge2_recall_stderr": 0.007981199503729296 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5495455894825296, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00787800597247414 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.691856821666126, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006672894747449168 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6764566471109283, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007029690572886852 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6721572966876234, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006912976226852015 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.6948970991963076, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00663727200833744 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6783319092636114, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006985143468937649 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6741839052529897, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006866709595529206 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eda2075e2b3868152f9a5ca0074d260338366f8f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 67.66856848794465, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.0374101515399263 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7264533852083246, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006162751580297105 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7079928962389903, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0065376714002391 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.7063241591706693, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0063895506161702285 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the 
solution", + "rouge2_precision": 0.584378730022088, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0076034059725184405 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5746379343660041, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007784718789034897 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5731687972529929, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007679000694598775 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.713110776716986, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006379011368799703 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6969105774629994, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006761958628976886 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6951407672322222, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006624064867867759 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7155814215074481, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006353282702585826 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6985325886365854, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006722803585394176 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6968122674715109, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006584539233814862 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..587ba3e248bb5e3371fb6ffb9b06a7c45974534f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 69.28977694653214, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.1383689347747308 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7343815169425306, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006056443243618853 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7213620418894502, + "dataset_path": "piqa", + "dataset_name": null, + 
"subset": null, + "rouge1_recall_stderr": 0.006317010827359718 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.7178841616364411, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0062143696022672204 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5967189945588884, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007450404459676049 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5886810928093721, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007604433786853759 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5864417429777042, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007504689248365126 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.721849768869843, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006265370504495436 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7107134184586044, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0065346021996612715 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.7071359010285193, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006438542604564407 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7242948640713006, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006233954534836144 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7124045941531557, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006494187176286558 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.7088299461511488, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006396597162497901 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..caed3f407c40ee37c87c0a430b816c571a64da9f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 70.23533344984375, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.9051054037488828 + }, + { + "task_name": "piqa", + 
"prompt_name": "Correct the solution", + "rouge1_precision": 0.7346785329679391, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0060241039486872055 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7292725498470866, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006197287192729478 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.7233652492867584, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00613471446094234 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5968961483516068, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007470554111948398 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5947349139408712, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007577156941207958 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5905591389126227, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0075043884335545055 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7231778967509447, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006226778582598875 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7189070797560939, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006405887103113551 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.7130555222674428, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006346247277373668 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.725163033308248, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00619102323663373 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7205115281184148, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006363858375725474 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.7146446045868446, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0063049545258677085 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..175f74f4e83e5c8158a5a2f148d45062788e2e76 --- /dev/null 
+++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_Correct-the-solution_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 70.32747678674686, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.009178623036782 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7365451997720951, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005959364204972434 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7321791188483596, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006130704848408115 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.7257789808109173, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006064762057166745 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5988456423040933, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007405666798204553 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5975633970178799, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007525805689294777 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5928285347617148, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007450597694271025 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.725508588212507, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006169249161730822 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7223180805776619, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006355782042971553 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.7159451295716375, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006290151298303866 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7273702460258058, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006130370372686494 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.723693210508481, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006312364180322713 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.7173761682145219, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006247247805738648 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c481a7f994aae36a367e6c319b5c8509fbc2a8ef --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8a9f5d001b816ccd9fb598e0c269fda50af86007 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.500544069640914, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665817258899171 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.500544069640914, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665817258899171 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f9d33f218e409d85fd51fd42d047aa9ef21a393d --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5103373231773667, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 
0.011663330673075898 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5103373231773667, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663330673075898 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..092bdb78ca60bac1e42cae802f3b49f37f4af2c3 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5027203482045702, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665651503000718 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5027203482045702, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665651503000718 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4853c0243497c44aea0ec734432c1a93790be2dd --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665713661738877 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665713661738877 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..59b60ea57dcccaa5449fbdd83a15359cb902a0e9 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.4961915125136017, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665485744746795 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.4961915125136017, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665485744746795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..38c21e604d07de233a3e3f4f64fc1e98bf159a37 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.1430667201766189, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.009240564731257003 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.020963380772971652, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0005571764739275214 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.22474022783929637, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004232469770939408 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03634213056601697, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008852037419284785 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0034206314074165032, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00016377560466964583 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03979709302516331, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020218681983000908 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005972148606985499, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00027694080927670884 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 
0.018785915434537857, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00045961716766157023 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.2066360068752239, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0038201093018895315 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.032687390001550964, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007387342525395488 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.017178056045255737, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0004610204906653324 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.19189650357961507, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003798369392011233 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.02982601102364362, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007298100305106965 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..740408dc1708be850cacd9ebe1a288dda6d16277 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.3124984853443106, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.02657457812443979 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.07320008291691311, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0029027310550941185 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.19206179363741116, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004198403115580751 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.07833603488907773, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0024221956295178153 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.014935910160543195, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001066118611892239 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.035651065690208456, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001838136787740149 + }, + { 
+ "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.015687196908127872, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0009548214688453516 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.061440768676818164, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0024509134853169003 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.16762193679347684, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0037347291719288735 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.06582546671313708, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002008068318413744 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.06328842881073514, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002545161737546467 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.16802715246598093, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00380229809853059 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.06732250510342716, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002101257193346951 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..96e96e94eaee3b7149bfd36f99ad3dd481a924b8 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.6472470961799496, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.04741660786692279 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.07353029295593974, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003521754204664174 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.0858786615791722, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0035032589854169945 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.06105404439408963, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0026311682541945358 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.016706091366165337, + "dataset_path": "piqa", + "dataset_name": 
null, + "subset": null, + "rouge2_precision_stderr": 0.0015315526778284171 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.0184312055942394, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00146285613399111 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.013649436315540416, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0011294417436508105 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.06272665005848953, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003073312683730802 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.07420703288761338, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003078170412658722 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.05190903182793309, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0022643442898557923 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.06458457210352117, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031636187057443627 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.07525324884676557, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0031251840055039758 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.05329452859101479, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002333128047865669 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c556a90c76c65a493b542fb8c3167e316a03662e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.6062635978384739, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0782482303422379 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.07452134039693918, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003576611335222079 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.07615883857534307, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003364735282169584 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + 
"rouge1_fmeasure": 0.06096787511489869, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0026990107298849103 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.01697672999338089, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001565185650016919 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.016230779058456612, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0013871281299543618 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.013471867168580705, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0011304768221616576 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.06522960654210525, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0032019612779513027 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.06648909991780103, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002980799399292851 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.05302910218785953, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002381323449986505 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.06706238126084484, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032872562006614737 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.06804502241505245, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003049896418062481 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.054388195617974935, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002434407168355548 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..880cc3ffbd65e8e29c474d397face6f9ac4a9c93 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.6185938082482176, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07788119676580058 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.09281330184555696, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 
0.0040489108208186355 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.0821435584513187, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003365995853519246 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.0711834879764609, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0028766907517215213 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.02177748235883171, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001798003291922353 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.01795915149475838, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0015049699206362978 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.01601647142341596, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001277963330949647 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.08028507288137521, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003552480741353349 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.0721305037000189, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003031967737705387 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.06189574961323715, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0025620835394899715 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.08377438466476718, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003718032093123604 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.07390575338066406, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003087807865518411 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.06398308391285433, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002632124175332435 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b08caf3b6d298ad9790e037db8e0be74e566ee9a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_no-prompt-needed_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.6938298171270286, + 
"dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06310426002073874 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.09754684198832757, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004080988763741584 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.08901016385362265, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0034796844878200567 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.07652826047415558, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0029524964693573093 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.023058115240956877, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0018391142815039204 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.019975557763278272, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0016124179718119594 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.01743988575661535, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001341444432007256 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.08435632806664412, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0035957308634670844 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.07836574159905216, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003158055668768904 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.066416662354599, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0026142608788368494 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.0870783293964629, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003725904197191887 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.07928068070866405, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0031561546312338156 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.06800373888434776, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00266716520288408 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_0.json new file 
mode 100644 index 0000000000000000000000000000000000000000..a9402d4fd0fed4fd119e11a05d2eec8a19196c09 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4956474428726877, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166538214464238 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4956474428726877, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166538214464238 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a495576b826be9d3872d3a04c09bfae678db92b3 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.499455930359086, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665817258899177 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.499455930359086, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665817258899177 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1fc0a2fb2735d1ac39795b08cf6016a9618ef825 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5108813928182807, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663061261117746 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5108813928182807, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663061261117746 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6df2beb6b1840cffb0611b05108f3b71d84c4cd6 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5217627856365615, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011654768618560072 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5217627856365615, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011654768618560072 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..24e72e3ee9bf2e76f8cf896b5911ab16f3ce79e9 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5119695321001088, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011662480968070071 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5119695321001088, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011662480968070071 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..454338c572a10ee523de6fcddb2f3cf62dae7bce --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_pick_correct_choice_index_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.500544069640914, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665817258899171 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.500544069640914, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665817258899171 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..963f6d6f49af4e9607f52b0ea5782836f359f8ed --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5620239390642002, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011575720065594108 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5647442872687704, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01156760858875942 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e355944373c64c3dc3ecf96cefcb720c84883b4a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5565832426550599, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01159088337366686 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5516866158868335, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011603326108334509 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..05ad9373783da744219c5276307e9639d550cfa8 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5652883569096845, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011565943814308853 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5631120783460283, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011572517929968272 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5c0962f07d6fca1c68d95830a95d11c81ac9fb84 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5505984766050055, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01160593662415608 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5500544069640914, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01160722083798011 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fba286e80961323bf9749c4d99a8832009d76f79 --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5522306855277476, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01160199979686681 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5538628944504896, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011597936590301236 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..76103b5af74e0ec3b3576b33cb50d1f422e96d12 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5576713819368879, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011587963545507183 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5565832426550599, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011590883373666858 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8242eeb929b5f5b5fe384c333d7d8832c0b78396 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.577, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01563058909047635 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.503, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015819015179246724 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ed67515bd48561027d641b939d508b9a9c9cff9a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.639, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015195720118175124 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.633, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015249378464171747 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f9f89e3378e7047b4304e85a5d1af79936e1a623 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.643, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015158521721486774 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.647, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015120172605483697 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4e0a27dae3e75e6d4eb63f6c38fb08e633df7879 --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.645, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015139491543780532 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.66, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014987482264363937 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..64ed2a5a00edfa6ae49d827b755ed98050897268 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.643, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015158521721486773 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.655, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015039986742055237 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..41482b445c40c2be1ef84ea78790689603f3a6b0 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.644, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015149042659306626 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.659, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014998131348402707 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6abbb9351b7086ced86bca2c357956e7fa5bafbb --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.832, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01182860583145426 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.757, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.013569640199177458 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..db16ea641eb41acec15be2b3e29fe782a2a3a575 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.884, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.010131468138756995 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.875, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.010463483381956722 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7d91fc9fa08ede60dc3ff7f250f0a44731f32a3d --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.896, + "dataset_path": "sciq", + "dataset_name": null, + "subset": 
null, + "acc_stderr": 0.009658016218524286 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.897, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009616833339695803 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5959b5c90639fabbe55c572166168b48eec1e190 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.893, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.009779910359847165 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.901, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009449248027662758 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..98ff9778be9d1dcaed4e4dbacc9f3cddafec9402 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.904, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.009320454434783227 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.904, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009320454434783243 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..efb8b03a42f50ac4dd7f25a6ca6f6edd69f4c52c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Direct-Question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.906, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.009233052000787745 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.913, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008916866630745873 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b89721a18b8101645aee3300c5f1e741c246b688 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.317, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01472167543888022 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.337, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014955087918653603 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6d8e800cf18acd4d1c7c729be12958676fc482a3 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.346, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015050266127564443 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.348, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015070604603768408 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8f5e9546d91f2e5f6f9aaafd87187f4d36400a56 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.361, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01519572011817512 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.365, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.0152317762262649 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..131618f32d020c2539c5f9d6fee3c3f8f1e87c49 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.367, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015249378464171749 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.376, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015325105508898125 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c233b2ed6e50c09f40c2cdb58fceae6797959169 --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.365, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015231776226264893 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.379, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015349091002225347 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b4528dd95a4508338b216007253763e8d5a83d4b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.357, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015158521721486762 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.373, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015300493622922805 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e6954613c1b93f315a0054c6a50128119cb0ce55 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.309, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014619600977206486 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.321, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014770821817934642 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..64245e178660a9cba0693add6c68767dbd4ff675 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.35, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015090650341444231 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.343, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015019206922356951 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0c57d6f04182537801fb59b6b7e68d22b685ea88 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.38, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015356947477797585 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.397, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015480007449307987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b8a8cea72b8d7cde8ab570b5f11a126b71264c5a --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.417, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015599819048769618 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.407, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015543249100255545 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..73f1216a8177a6645b0ef365d82c7487e6e7bd38 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.4, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015499685165842589 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.408, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015549205052920675 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dc07fea2a33df2b4440f099fd548f83ebe999e1a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.368, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015258073561521805 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.388, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015417317979911072 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..876f4a2779aebc7d817fe67ddc063082a83eece6 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.32, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014758652303574886 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.333, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f1d0cbb684581ef2f26c152521e71ed4de428fd2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.324, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014806864733738863 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.342, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015008706182121731 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6938d450503d8e02e8974161fbba4e8f66154324 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.321, + "dataset_path": "sciq", + "dataset_name": null, + "subset": 
null, + "acc_stderr": 0.01477082181793464 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.328, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014853842487270334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6147e07fa4bf41be18f600c9e0dc197bd2c25151 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.323, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014794927843348639 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.326, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014830507204541033 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ec204b71d7a3cd319383bf4643a016fbb6381591 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.331, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014888272588203934 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.342, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015008706182121728 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..e5452a84aed5e22f57b7c7faf84b06ec7cb4c63e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_sciq_Multiple-Choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.307, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014593284892852634 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.326, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014830507204541024 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ee1a390627b3c3d8915eb2aec8f22550abf0fcec --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.5093532870122929, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011560409019420364 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.5243185462319615, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011548748301487312 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b7a8dab60912d965faa1f977d21f5c94941995ce --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4692677712453234, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011540570846495547 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.48476750400855156, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01155706536834828 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6d74d5c94b99313f4e370ba144426105f9d5a2e0 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4569748797434527, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011519544865928065 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.47835382148583644, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011551591851683337 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f11adf9d7d38dfeb22fc5e280a903c7e76a59318 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4596472474612507, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011524715486240652 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.46766435061464456, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011538227692217271 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..c3a2b4dad68cafc171d58e0816285b8bda9e1978 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.46285408872260825, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011530479981182624 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4665954035275254, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011536599118298163 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ca4034f4143dde459792429aad23eff72f2ffc14 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.45911277391769106, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011523708060182089 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4633885622661678, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01153139408454962 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..46b643a55c7065b22c3617928c691a90820d3f1f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.518439337252806, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011554566910658103 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5360769641902726, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01153229486915312 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5a86611b14e6eebb1350547de8a1f55130e06c9 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.484233030464992, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011556682042196382 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.504008551576697, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562060664045727 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d0a099b9199d0ec05a431805e76f769ff60d7949 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4740780331373597, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011546883081384896 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4927846071619455, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561228264646724 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..123fed1d8657e8dabfd35c88e7f6505a40edf80a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.47728487439871725, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011550494192008943 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4735435595938001, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01154623481377739 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..65459ee80811f81beab56dc97ec15808105dfa33 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4735435595938001, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011546234813777397 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.47888829502939606, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011552120807053822 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3de554e5a7a2f53e410b0731860fb43f3605aba2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4665954035275254, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011536599118298168 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4681988241582042, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011539022035111222 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8bda636daf387636552deb5b1c495ec503c2a634 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a6e4be259f000790ed370b6231ccb8aaec602a5b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fc952f5ff3cf145135fab6e6431ee7c08dcb126f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c816ae10a1b49ae3e3e6a8df783e26f4c86d4667 
--- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6e24fd89c8331c2dce598f63054fe306dc3aad87 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4ff31b3da2878ffdaa59c9ea4463ad835b65e488 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..66c5d9e926607ebf587a43c0abcc11a563b1a31a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.498663816141101, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011562390964658758 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.5259219668626403, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011546883081384905 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..db91e142db62cbabe6dbf74c894a4d6f88c345e1 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.46766435061464456, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011538227692217271 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4949225013361839, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561836054238776 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..760ccca591d37fb63e23a477f0d65262c53255d2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.467129877071085, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011537420054210294 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4746125066809193, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011547518083754583 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..ee1d6c11863465cd85b715dfe2d28f0f5f10fe65 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.45537145911277394, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011516282203726655 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4649919828968466, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011534056494505864 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..800784555fc350945886a28f8813b3e769f03540 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.45537145911277394, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011516282203726655 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4564404061998931, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011518470676766505 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c73ecc354988e3b63ed714c67e9c7b6a99840cde --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.46285408872260825, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011530479981182626 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4607161945483699, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011526690316014585 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5366edb87d0a67cc8bf0d5de71c3dbffde3bcd2b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.5061464457509354, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011561558589040751 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5339390700160342, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011535764881641411 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..76a32fc448fa7e3307a7e164b2b5210df36af4ca --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4879743452699091, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011559087533800682 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5045430251202565, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561954965856519 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_2.json 
b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8c5b5f2d508ac20dbfc84541999d60301cc3919b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.48102618920363444, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011554104174019692 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4831640833778728, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011555875693960778 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6720cf16ffd4e73fb9f79d5073c4cac86230619f --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.47621592731159806, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011549343521088362 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4681988241582042, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01153902203511122 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2eaf3b2e45cc7142429141209719b58ac4eda111 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.47247461250668094, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011544898473864588 + }, + { + "task_name": 
"story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.47621592731159806, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011549343521088362 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7140d7357e87438c461737a20f065b470108b4cf --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.47140566541956175, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011543509045585203 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4708711918760021, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011542794417345716 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eea9863c367dd7f7c2388f45ba1bcd842c8f1245 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976633 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at 
end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..140df56c047dcfbc7a051c7d417daa685ba7c444 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030025579819366422 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030072723167317177 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..749bab170597df30b015aad479fae9c1b754e0ec --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..12e6bb873defe925ff1e23d406a4f0dfbe2548d2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030063300411902652 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030039730592197812 + } + ], 
+ "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e65beb600f0f9ac213a94bd5c0a7d03567466132 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030039730592197812 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ab4063cc0e833128f5ac44657d36e6749b4945c1 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03003973059219781 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1f7fca8a4f4fc2f693505e954d76c6a3b21778bf --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..78249279158f6d6a05bd2612868fae135a50ce6d --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..552dc6c0d1304a9fb859f384acf8214da625a348 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..67961e51d2160d8710ba3a724ccff1d775a7cd0c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030063300411902652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b8c3766bd823f6d594ab25bf125e89a0cf924737 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030072723167317177 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f3afa1bd339ebb7537b1df22b2e9442d3248a371 --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03006330041190266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..38fe52d09f0e159a7edcc62f17614cf07d85066a --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..15d87db27980ad6ee44375f0154b61e1741572d2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a719614a62815261c4ed3d1c612a9d0c587f5735 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.0300727231673172 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..759980106f8e9cc9bcd8852cc618f578628bd35b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..61fdcdabd54c1b34d4070f1503fd06ab1cf0b2d3 --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03006330041190266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3d916ebe475e2e6f6d87f20372a002bf80b5225e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..84fd6d467b079c5a01823bf8a537618f4d44b6df --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030039730592197812 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c11d39985bf597aae0161d9c2067a4bae720e39e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7680891e6bc9cf1f5d75f95265f68764e7145284 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976626 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976633 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..88f8c269120963f2c2275828909a12073bb62a0a --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366422 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0a59c214b10925792eb98e986a1c824238086e8e --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5451263537906137, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029973636495415252 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03003973059219781 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5ed5cdc116951f1c7e35e4309b5fda0a3825b175 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_guaranteed-true_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030072723167317184 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..845afe99d243ae3c7c78b5eb02b929ad741ce158 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c61ffe62aa46f480e819c9be576b1c28fdfd07bd --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4b4ac2159e2f4f1304480440bd2c15035a1604f3 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_2.json @@ 
-0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331327 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e89bcb05934659ea5712607a160a247942c2c2cd --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..112cd418c51badd3f15f4d6423a0a4e7fc867c00 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030063300411902652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 
4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0eb7529e5142172bd2590d63d2b61c10fa2320f3 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_superglue_rte_should-assume_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976633 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4b4c2f81ad9e80b732e75cb1131b1838ad18847c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5193370165745856, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014041972733712977 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616441 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f037c5f49992ca40703a5781c78c86f323d04456 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.516179952644041, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045126130978594 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5153906866614049, + 
"dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014045826789783658 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bb7b3bcd132152a9f470d2edb05faf53f1f50a8d --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5130228887134964, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9e569ce91308ef4d2d7b4e9d7122c46d4bd43e2b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.516179952644041, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045126130978596 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.505130228887135, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405174596179051 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..65e0f5319bb12b72ed33eab6b0838c7fa3553de4 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052446290529019 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.49013417521704816, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..aedfe09e41d7b5190c60466ef162f5ff28877a65 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_Replace_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5177584846093133, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014043619596174962 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049512 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf091b1e268cc923b16cd7ba466a4e18d0fb58d --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076896 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4940805051302289, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051500838485807 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..543153e48884a301e276dcea0f9bb26a839bacd2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052376259225632 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.48539857932123126, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014046492383275834 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5e14b3a23e2fb385145a438299664773b38c3a35 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052376259225629 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4846093133385951, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014045826789783656 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..51d92c7a27c14e5fb1e503886780f93b658832c2 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_3.json 
@@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5146014206787688, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014046492383275832 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.516179952644041, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014045126130978601 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ca9d83b6b61e57d26ba8d178ad168e46685dc95b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049294536290396 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048278820405621 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..21b65bd33e7b28e9c545cdd382b21bf0aa5234c7 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_True-or-False_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915864 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.494869771112865, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051745961790516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..433de03bdff7b0dcc5bb50760119a523c986c1b1 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.489344909234412, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140492945362904 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4846093133385951, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014045826789783666 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c2d4c70b44d8552678017895bd37f6fd2b48a5f8 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052376259225632 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..58ad50d34b8a74e0f010e880a3221cbbf4fefd19 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 
0.489344909234412, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140492945362904 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.47987371744277824, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014041096664344327 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c9c93c569524decba2d1924476388844a4a56254 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052376259225636 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.48303078137332284, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014044390401612972 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d387a3fbda34f0742a7a53754990f1f6e6ddfb36 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.48224151539068666, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01404361959617496 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.48697711128650356, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997667 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f92285cd1dfd066f3112bb1998c66491b5f32605 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616452 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4940805051302289, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051500838485807 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..16fb573fd4a46451a3b37be70450c943ac862668 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5138121546961326, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014047122916440415 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..696e3172d9d79fa5657df34e3e11634106962721 --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052481306049512 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5130228887134964, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997667 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..573ab11be333c7e84d1a8c13e666a94297f3003d --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5185477505919495, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014042813708888378 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5138121546961326, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047122916440422 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..89d708cba051f7a727819fa3d91570f18c30f840 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405090552122858 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049294536290396 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5f026d0be0c407fdf232cd54f14f4c870bf8d458 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5217048145224941, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01403923921648463 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5114443567482242, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048804199859325 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4d46e56a6ec941fd34808fda7b2ae255d481639b --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_stand-for_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051220692330349 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052131146915852 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_0.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1f446532c62552a1b841fbe7afcdca4ad2c48865 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.4861878453038674, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014047122916440415 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.49329123914759276, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_1.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fa4990edd8f5b4f6361e101becbf72f8f0e2a8b0 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050905521228577 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052131146915853 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_2.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5e82dab6b04a45d2061b8c866eba7508e741cc5c --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915867 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049516 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_3.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..216f636be982b57a79b0ba7e880662c34306ae24 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5272296764009471, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014031631629827703 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5153906866614049, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014045826789783672 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_4.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d6e0edd603b7184cb6f1e616dc431b70f58693e4 --- /dev/null +++ b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.516179952644041, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045126130978601 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076903 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_5.json b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..effe81c5782e1aecb35ab6939b6c0ec373913491 --- /dev/null +++ 
b/4b284b12boscar/eval/slim.4b284b12boscar_winogrande_underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5153906866614049, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045826789783666 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5138121546961326, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047122916440419 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/merged.csv b/4b284b12boscar/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..3937d38e3d0448030a2db6020afd01b5dfb28120 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.054594037237788585 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.054594037237788585 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.2086255432248638 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.2086255432248638 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.234132554236615 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.234132554236615 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.24473795777586516 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.24473795777586516 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2467411297660491 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2467411297660491 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.24925086960246148 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.24925086960246148 +e2e_nlg_cleaned,5,average,multiple,0.20634701530727384 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04756359555453488 +gem_xsum,0,median,rouge2_fmeasure,0.04756359555453488 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04293890964191621 +gem_xsum,1,median,rouge2_fmeasure,0.04293890964191621 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.051547066687828734 +gem_xsum,2,median,rouge2_fmeasure,0.051547066687828734 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.0505319645754636 +gem_xsum,3,median,rouge2_fmeasure,0.0505319645754636 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.012855134170914497 +gem_xsum,4,median,rouge2_fmeasure,0.012855134170914497 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0001226302404615478 +gem_xsum,5,median,rouge2_fmeasure,0.0001226302404615478 +gem_xsum,5,average,multiple,0.03425988347851991 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05146498360167007 +web_nlg_en,0,median,rouge2_fmeasure,0.05146498360167007 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.07885240324677927 +web_nlg_en,1,median,rouge2_fmeasure,0.07885240324677927 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.10271084479951895 +web_nlg_en,2,median,rouge2_fmeasure,0.10271084479951895 
+web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.10810375284373198 +web_nlg_en,3,median,rouge2_fmeasure,0.10810375284373198 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.1144786865204847 +web_nlg_en,4,median,rouge2_fmeasure,0.1144786865204847 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.11914106590715341 +web_nlg_en,5,median,rouge2_fmeasure,0.11914106590715341 +web_nlg_en,5,average,multiple,0.09579195615322306 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03000716115226249 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03000716115226249 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04395688090556475 +wiki_lingua_en,1,median,rouge2_fmeasure,0.04395688090556475 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06557325455887651 +wiki_lingua_en,2,median,rouge2_fmeasure,0.06557325455887651 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.055968236855402816 +wiki_lingua_en,3,median,rouge2_fmeasure,0.055968236855402816 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.019691124849550927 +wiki_lingua_en,4,median,rouge2_fmeasure,0.019691124849550927 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0029524028339129186 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0029524028339129186 +wiki_lingua_en,5,average,multiple,0.03635817685926174 diff --git a/4b284b12boscar/evaluation/generation/merged.json b/4b284b12boscar/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..8a08f0a6e56a19b0ef45de2045fbcb1bde2b72c4 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4358876645741479, "bleu_stderr": 0.04475556276641748, "rouge1_fmeasure": 0.11084365567127677, "rouge1_fmeasure_stderr": 0.0022963454664604076, "rouge1_precision": 0.08601617848273047, "rouge1_precision_stderr": 0.003027222218230607, "rouge1_recall": 0.29226306498620885, "rouge1_recall_stderr": 0.005344776046578116, "rouge2_fmeasure": 0.05146498360167007, "rouge2_fmeasure_stderr": 0.0014277295107541841, "rouge2_precision": 0.04041394987853437, "rouge2_precision_stderr": 0.002132583963194437, "rouge2_recall": 0.1415225530229299, "rouge2_recall_stderr": 0.0034803378779995004, "rougeL_fmeasure": 0.10611300272902285, "rougeL_fmeasure_stderr": 0.002132214334314073, "rougeL_precision": 0.08239970522779808, "rougeL_precision_stderr": 0.0029176133842350504, "rougeL_recall": 0.2827601971319833, "rougeL_recall_stderr": 0.00519215142624447, "rougeLsum_fmeasure": 0.10445493190549919, "rougeLsum_fmeasure_stderr": 0.00215267434014802, "rougeLsum_precision": 0.0815748628057772, "rougeLsum_precision_stderr": 0.002939098844173709, "rougeLsum_recall": 0.2756279858381666, "rougeLsum_recall_stderr": 0.0049864009961195395}}, "1": {"PALM_prompt": {"bleu": 0.6458190048668558, "bleu_stderr": 0.03274318464461181, "rouge1_fmeasure": 0.16170102806244036, "rouge1_fmeasure_stderr": 0.0036142070180597507, "rouge1_precision": 0.13985160542322855, "rouge1_precision_stderr": 0.004374312136397895, "rouge1_recall": 0.32007070778063884, "rouge1_recall_stderr": 0.004962532451445636, "rouge2_fmeasure": 0.07885240324677927, "rouge2_fmeasure_stderr": 0.0023523131620574156, "rouge2_precision": 0.0694179465016437, "rouge2_precision_stderr": 0.002921286815952664, "rouge2_recall": 0.15883761787677303, "rouge2_recall_stderr": 0.003480119679954502, "rougeL_fmeasure": 0.14742576479874345, "rougeL_fmeasure_stderr": 0.0031238502419319854, "rougeL_precision": 0.12591553956133228, "rougeL_precision_stderr": 0.003854734977633317, "rougeL_recall": 0.30061625652088586, "rougeL_recall_stderr": 
0.004630766957074663, "rougeLsum_fmeasure": 0.1497000837635611, "rougeLsum_fmeasure_stderr": 0.0032201737172114866, "rougeLsum_precision": 0.1283694931084644, "rougeLsum_precision_stderr": 0.003947705918972332, "rougeLsum_recall": 0.3021994113995889, "rougeLsum_recall_stderr": 0.004647780404212457}}, "2": {"PALM_prompt": {"bleu": 0.7970016196921748, "bleu_stderr": 0.04581096081746233, "rouge1_fmeasure": 0.19703997895188197, "rouge1_fmeasure_stderr": 0.0042195368281687335, "rouge1_precision": 0.18059337363344904, "rouge1_precision_stderr": 0.005339355314623877, "rouge1_recall": 0.3595991748021704, "rouge1_recall_stderr": 0.004980869877485108, "rouge2_fmeasure": 0.10271084479951895, "rouge2_fmeasure_stderr": 0.0029196673345892295, "rouge2_precision": 0.09688101950971822, "rouge2_precision_stderr": 0.0036313101633365223, "rouge2_recall": 0.18755499121707667, "rouge2_recall_stderr": 0.00376908816872013, "rougeL_fmeasure": 0.1772530360597836, "rougeL_fmeasure_stderr": 0.0035959119026034024, "rougeL_precision": 0.15982668252407492, "rougeL_precision_stderr": 0.004603772659928646, "rougeL_recall": 0.3351391580099154, "rougeL_recall_stderr": 0.004585332134390738, "rougeLsum_fmeasure": 0.18084941775566396, "rougeLsum_fmeasure_stderr": 0.0037099919587234576, "rougeLsum_precision": 0.16383301190827276, "rougeLsum_precision_stderr": 0.004742860098171715, "rougeLsum_recall": 0.33868792774850437, "rougeLsum_recall_stderr": 0.004634428210263519}}, "3": {"PALM_prompt": {"bleu": 0.9098079216712224, "bleu_stderr": 0.049900879662925314, "rouge1_fmeasure": 0.20522389412159425, "rouge1_fmeasure_stderr": 0.0043615630325957435, "rouge1_precision": 0.19212508744736018, "rouge1_precision_stderr": 0.005617174437077427, "rouge1_recall": 0.3703600547581712, "rouge1_recall_stderr": 0.004938819064876214, "rouge2_fmeasure": 0.10810375284373198, "rouge2_fmeasure_stderr": 0.003075123686799679, "rouge2_precision": 0.10484577205978293, "rouge2_precision_stderr": 0.003926715523602227, "rouge2_recall": 0.19383710059664028, "rouge2_recall_stderr": 0.003742953011901514, "rougeL_fmeasure": 0.18478247984025434, "rougeL_fmeasure_stderr": 0.003743746721954891, "rougeL_precision": 0.17083343128882605, "rougeL_precision_stderr": 0.004911086258050898, "rougeL_recall": 0.34476103034505623, "rougeL_recall_stderr": 0.004533133149997157, "rougeLsum_fmeasure": 0.18830268691133512, "rougeLsum_fmeasure_stderr": 0.003857827801954891, "rougeLsum_precision": 0.17532235438945584, "rougeLsum_precision_stderr": 0.005084911003429309, "rougeLsum_recall": 0.3483193020595072, "rougeLsum_recall_stderr": 0.0045873162051077825}}, "4": {"PALM_prompt": {"bleu": 1.052600188336286, "bleu_stderr": 0.06665675598684426, "rouge1_fmeasure": 0.21317352472996992, "rouge1_fmeasure_stderr": 0.004401402126159575, "rouge1_precision": 0.20108020030540724, "rouge1_precision_stderr": 0.005644545699403245, "rouge1_recall": 0.3818834779332785, "rouge1_recall_stderr": 0.004965307876320074, "rouge2_fmeasure": 0.1144786865204847, "rouge2_fmeasure_stderr": 0.003126094675218737, "rouge2_precision": 0.11098039808305214, "rouge2_precision_stderr": 0.003927637491564618, "rouge2_recall": 0.204923577311545, "rouge2_recall_stderr": 0.003948475544315969, "rougeL_fmeasure": 0.19087293738824496, "rougeL_fmeasure_stderr": 0.0037489593597534975, "rougeL_precision": 0.1772302061036221, "rougeL_precision_stderr": 0.0048597986989461165, "rougeL_recall": 0.35524087803093607, "rougeL_recall_stderr": 0.004631433081108199, "rougeLsum_fmeasure": 0.19652423173229852, "rougeLsum_fmeasure_stderr": 
0.003927577783905062, "rougeLsum_precision": 0.183821519223083, "rougeLsum_precision_stderr": 0.005085692199041701, "rougeLsum_recall": 0.36063124635272004, "rougeLsum_recall_stderr": 0.004673788271599251}}, "5": {"PALM_prompt": {"bleu": 1.0960931080747824, "bleu_stderr": 0.06344656593457922, "rouge1_fmeasure": 0.22442306072164447, "rouge1_fmeasure_stderr": 0.004490024810310396, "rouge1_precision": 0.21325470731107665, "rouge1_precision_stderr": 0.005803174587983953, "rouge1_recall": 0.3950162351216541, "rouge1_recall_stderr": 0.004977496708782323, "rouge2_fmeasure": 0.11914106590715341, "rouge2_fmeasure_stderr": 0.003087212545803013, "rouge2_precision": 0.11715823165872309, "rouge2_precision_stderr": 0.004000375023616823, "rouge2_recall": 0.2112966644324947, "rouge2_recall_stderr": 0.003874637116812206, "rougeL_fmeasure": 0.19921393354497344, "rougeL_fmeasure_stderr": 0.0037728930475513494, "rougeL_precision": 0.1866801247482292, "rougeL_precision_stderr": 0.004986038924327078, "rougeL_recall": 0.36529275376302506, "rougeL_recall_stderr": 0.0045911093135183, "rougeLsum_fmeasure": 0.20449901465956502, "rougeLsum_fmeasure_stderr": 0.003922872033805495, "rougeLsum_precision": 0.1931918251374917, "rougeLsum_precision_stderr": 0.00519492169870331, "rougeLsum_recall": 0.3699764207220656, "rougeLsum_recall_stderr": 0.004618932928096925}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.278917009551764, "bleu_stderr": 0.09140957868815724, "rouge1_fmeasure": 0.12988512607981967, "rouge1_fmeasure_stderr": 0.0024439343174664525, "rouge1_precision": 0.122915905754568, "rouge1_precision_stderr": 0.0027293510115185423, "rouge1_recall": 0.1767818917839268, "rouge1_recall_stderr": 0.0033841479310980638, "rouge2_fmeasure": 0.03000716115226249, "rouge2_fmeasure_stderr": 0.0009354733970545372, "rouge2_precision": 0.026806616635118036, "rouge2_precision_stderr": 0.0008667140586963389, "rouge2_recall": 0.04199675102883295, "rouge2_recall_stderr": 0.0014222033630357748, "rougeL_fmeasure": 0.09879731353636668, "rougeL_fmeasure_stderr": 0.001799103726263371, "rougeL_precision": 0.0938740707460975, "rougeL_precision_stderr": 0.002177005796825213, "rougeL_recall": 0.1374323483251746, "rougeL_recall_stderr": 0.00266526479478189, "rougeLsum_fmeasure": 0.12094018141973355, "rougeLsum_fmeasure_stderr": 0.002284523760450248, "rougeLsum_precision": 0.11485464277414363, "rougeLsum_precision_stderr": 0.002596139008897079, "rougeLsum_recall": 0.16449395724493923, "rougeLsum_recall_stderr": 0.0031654167638919778}}, "1": {"tldr_en": {"bleu": 2.707464673902901, "bleu_stderr": 0.1122035248088363, "rouge1_fmeasure": 0.18879633772816867, "rouge1_fmeasure_stderr": 0.002216973861149929, "rouge1_precision": 0.2321033086693269, "rouge1_precision_stderr": 0.003444990523939608, "rouge1_recall": 0.21218959051209052, "rouge1_recall_stderr": 0.0028780222957239345, "rouge2_fmeasure": 0.04395688090556475, "rouge2_fmeasure_stderr": 0.0012580561387008646, "rouge2_precision": 0.05986007346081548, "rouge2_precision_stderr": 0.0021311948977268975, "rouge2_recall": 0.048915032760613236, "rouge2_recall_stderr": 0.0014864819960528294, "rougeL_fmeasure": 0.14456965085181128, "rougeL_fmeasure_stderr": 0.0017115993134306947, "rougeL_precision": 0.18107984090374726, "rougeL_precision_stderr": 0.0028960619334779066, "rougeL_recall": 0.162566462675585, "rougeL_recall_stderr": 0.002220609266556534, "rougeLsum_fmeasure": 0.1776723900847922, "rougeLsum_fmeasure_stderr": 0.0020872816698884362, "rougeLsum_precision": 0.2192803166896013, 
"rougeLsum_precision_stderr": 0.0033059632616290542, "rougeLsum_recall": 0.19959677205987747, "rougeLsum_recall_stderr": 0.002694200019026103}}, "2": {"tldr_en": {"bleu": 3.7904798767943335, "bleu_stderr": 0.11122294896753164, "rouge1_fmeasure": 0.23558432539564403, "rouge1_fmeasure_stderr": 0.0021996425715538554, "rouge1_precision": 0.3147030364809968, "rouge1_precision_stderr": 0.0037689305738243476, "rouge1_recall": 0.25097433153568405, "rouge1_recall_stderr": 0.0028925947424101703, "rouge2_fmeasure": 0.06557325455887651, "rouge2_fmeasure_stderr": 0.0013735424734152642, "rouge2_precision": 0.09338720402263002, "rouge2_precision_stderr": 0.002368756352904399, "rouge2_recall": 0.06942686553346078, "rouge2_recall_stderr": 0.0016202813479301297, "rougeL_fmeasure": 0.17976921379978278, "rougeL_fmeasure_stderr": 0.001722115310366516, "rougeL_precision": 0.244935161158131, "rougeL_precision_stderr": 0.003198495702253031, "rougeL_recall": 0.1911576784626261, "rougeL_recall_stderr": 0.002276055119508395, "rougeLsum_fmeasure": 0.22099937310878076, "rougeLsum_fmeasure_stderr": 0.002075646266476727, "rougeLsum_precision": 0.2958662208042827, "rougeLsum_precision_stderr": 0.003600934016250172, "rougeLsum_recall": 0.23572659332760645, "rougeLsum_recall_stderr": 0.002741901482334439}}, "3": {"tldr_en": {"bleu": 2.8632324313744557, "bleu_stderr": 0.09299178906804655, "rouge1_fmeasure": 0.19838663921126778, "rouge1_fmeasure_stderr": 0.002621759128746396, "rouge1_precision": 0.2754936136179268, "rouge1_precision_stderr": 0.004157379347035767, "rouge1_recall": 0.20528413095627832, "rouge1_recall_stderr": 0.003149288824376793, "rouge2_fmeasure": 0.055968236855402816, "rouge2_fmeasure_stderr": 0.0014034751658766104, "rouge2_precision": 0.08256853694746276, "rouge2_precision_stderr": 0.0024364087557486665, "rouge2_recall": 0.05727334409426975, "rouge2_recall_stderr": 0.0015675844642289158, "rougeL_fmeasure": 0.15243563853997083, "rougeL_fmeasure_stderr": 0.0020406692721428466, "rougeL_precision": 0.21625894747073696, "rougeL_precision_stderr": 0.0034653747339702624, "rougeL_recall": 0.15717871923447801, "rougeL_recall_stderr": 0.00246051792291125, "rougeLsum_fmeasure": 0.18589041350629654, "rougeLsum_fmeasure_stderr": 0.0024658633555113495, "rougeLsum_precision": 0.2591405389486435, "rougeLsum_precision_stderr": 0.003962509253114959, "rougeLsum_recall": 0.19231398442318168, "rougeLsum_recall_stderr": 0.0029662256128315167}}, "4": {"tldr_en": {"bleu": 0.08259030504562871, "bleu_stderr": 0.0071288759428394894, "rouge1_fmeasure": 0.06581475266895565, "rouge1_fmeasure_stderr": 0.002308785748398076, "rouge1_precision": 0.09597320792153687, "rouge1_precision_stderr": 0.003523744199921863, "rouge1_recall": 0.06743171611131621, "rouge1_recall_stderr": 0.0025672971207144984, "rouge2_fmeasure": 0.019691124849550927, "rouge2_fmeasure_stderr": 0.001032704031652877, "rouge2_precision": 0.03073553158490422, "rouge2_precision_stderr": 0.0018205157024659981, "rouge2_recall": 0.02032186815564804, "rouge2_recall_stderr": 0.0011741990653970874, "rougeL_fmeasure": 0.05209993435974213, "rougeL_fmeasure_stderr": 0.0018508808521006768, "rougeL_precision": 0.07767834454496185, "rougeL_precision_stderr": 0.002968257043282438, "rougeL_recall": 0.05327838136711029, "rougeL_recall_stderr": 0.0020669018184854763, "rougeLsum_fmeasure": 0.06145306117564354, "rougeLsum_fmeasure_stderr": 0.0021697142437582524, "rougeLsum_precision": 0.09035808108893525, "rougeLsum_precision_stderr": 0.0033673742748827878, "rougeLsum_recall": 
0.06286918923105832, "rougeLsum_recall_stderr": 0.002407780221261254}}, "5": {"tldr_en": {"bleu": 3.0184106792465108e-15, "bleu_stderr": 1.78802158675685e-13, "rouge1_fmeasure": 0.00991335048881209, "rouge1_fmeasure_stderr": 0.0009630926105753386, "rouge1_precision": 0.015768279475199848, "rouge1_precision_stderr": 0.0015994817709509021, "rouge1_recall": 0.010208407683374372, "rouge1_recall_stderr": 0.0011101868759327263, "rouge2_fmeasure": 0.0029524028339129186, "rouge2_fmeasure_stderr": 0.00041005985526957695, "rouge2_precision": 0.005050282757985984, "rouge2_precision_stderr": 0.0007965820847066382, "rouge2_recall": 0.003233565984990156, "rouge2_recall_stderr": 0.0005539676407167323, "rougeL_fmeasure": 0.008248037341762705, "rougeL_fmeasure_stderr": 0.000807006653030911, "rougeL_precision": 0.013277476442981736, "rougeL_precision_stderr": 0.0013881298634661826, "rougeL_recall": 0.008616852733661218, "rougeL_recall_stderr": 0.0009635704353861369, "rougeLsum_fmeasure": 0.009376779842315807, "rougeLsum_fmeasure_stderr": 0.0009193911532830411, "rougeLsum_precision": 0.014923565167243124, "rougeLsum_precision_stderr": 0.0015288899364523146, "rougeLsum_recall": 0.009697966584991889, "rougeLsum_recall_stderr": 0.001073756967121187}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.205452750701432, "bleu_stderr": 0.029369723752371055, "rouge1_fmeasure": 0.2026666683966741, "rouge1_fmeasure_stderr": 0.001854044988549804, "rouge1_precision": 0.18794203900806028, "rouge1_precision_stderr": 0.002509182484953893, "rouge1_recall": 0.27563499652889517, "rouge1_recall_stderr": 0.002522516313059265, "rouge2_fmeasure": 0.054594037237788585, "rouge2_fmeasure_stderr": 0.0010529893200416562, "rouge2_precision": 0.04736777911623166, "rouge2_precision_stderr": 0.0009912253924337147, "rouge2_recall": 0.07529125273517918, "rouge2_recall_stderr": 0.0015129136922472945, "rougeL_fmeasure": 0.17465641737008017, "rougeL_fmeasure_stderr": 0.0014217853180858962, "rougeL_precision": 0.16091347621914998, "rougeL_precision_stderr": 0.0020468972719200795, "rougeL_recall": 0.24052353002182375, "rougeL_recall_stderr": 0.002110543832931725, "rougeLsum_fmeasure": 0.1754467264102351, "rougeLsum_fmeasure_stderr": 0.001775758147384932, "rougeLsum_precision": 0.162408167198493, "rougeLsum_precision_stderr": 0.0022254242193644625, "rougeLsum_recall": 0.23809586470877792, "rougeLsum_recall_stderr": 0.0024056082119999767}}, "1": {"generate_text_restaurant": {"bleu": 11.314986164630461, "bleu_stderr": 0.09886007247543747, "rouge1_fmeasure": 0.45168976701280994, "rouge1_fmeasure_stderr": 0.0023803410292144262, "rouge1_precision": 0.5405367881450536, "rouge1_precision_stderr": 0.0031944831139177473, "rouge1_recall": 0.4265799747472657, "rouge1_recall_stderr": 0.0030051178365182673, "rouge2_fmeasure": 0.2086255432248638, "rouge2_fmeasure_stderr": 0.002005283814444169, "rouge2_precision": 0.2530645509899577, "rouge2_precision_stderr": 0.0025820188553003945, "rouge2_recall": 0.19690559018039472, "rouge2_recall_stderr": 0.0021546256249304957, "rougeL_fmeasure": 0.32671702545878634, "rougeL_fmeasure_stderr": 0.0020575751841782584, "rougeL_precision": 0.3941062713485247, "rougeL_precision_stderr": 0.0028671074248121063, "rougeL_recall": 0.307687838606323, "rougeL_recall_stderr": 0.0024236181103855607, "rougeLsum_fmeasure": 0.3682903569907333, "rougeLsum_fmeasure_stderr": 0.0023311514221899116, "rougeLsum_precision": 0.44239922810973775, "rougeLsum_precision_stderr": 0.0031220549443630044, "rougeLsum_recall": 
0.3472525108890491, "rougeLsum_recall_stderr": 0.00274280048405182}}, "2": {"generate_text_restaurant": {"bleu": 13.825382196007423, "bleu_stderr": 0.14341892864265515, "rouge1_fmeasure": 0.47949895116223057, "rouge1_fmeasure_stderr": 0.002274496687632488, "rouge1_precision": 0.5584732398385521, "rouge1_precision_stderr": 0.0031726923154704037, "rouge1_recall": 0.45947280161409504, "rouge1_recall_stderr": 0.002947956873468934, "rouge2_fmeasure": 0.234132554236615, "rouge2_fmeasure_stderr": 0.002050727643013208, "rouge2_precision": 0.27533931731718275, "rouge2_precision_stderr": 0.0026239982120668894, "rouge2_recall": 0.2249642538757707, "rouge2_recall_stderr": 0.002262594422709972, "rougeL_fmeasure": 0.35427830623229684, "rougeL_fmeasure_stderr": 0.0020644801949430084, "rougeL_precision": 0.4144330916795227, "rougeL_precision_stderr": 0.002865466880028744, "rougeL_recall": 0.3391232820910882, "rougeL_recall_stderr": 0.002487675496155266, "rougeLsum_fmeasure": 0.39980527255607845, "rougeLsum_fmeasure_stderr": 0.002302036382207211, "rougeLsum_precision": 0.46586823562780794, "rougeLsum_precision_stderr": 0.0030764860572737814, "rougeLsum_recall": 0.38321253954384926, "rougeLsum_recall_stderr": 0.002786697010998289}}, "3": {"generate_text_restaurant": {"bleu": 14.734710333150248, "bleu_stderr": 0.13327591335822025, "rouge1_fmeasure": 0.49284383237521995, "rouge1_fmeasure_stderr": 0.0022741384691834527, "rouge1_precision": 0.566363810944366, "rouge1_precision_stderr": 0.0031329176207456266, "rouge1_recall": 0.4738543349642313, "rouge1_recall_stderr": 0.002916750104399688, "rouge2_fmeasure": 0.24473795777586516, "rouge2_fmeasure_stderr": 0.0021021762412204888, "rouge2_precision": 0.2835262882274797, "rouge2_precision_stderr": 0.002620037892529304, "rouge2_recall": 0.23582515668443185, "rouge2_recall_stderr": 0.002307150627998116, "rougeL_fmeasure": 0.364532741901036, "rougeL_fmeasure_stderr": 0.002119688594264684, "rougeL_precision": 0.4201683174467468, "rougeL_precision_stderr": 0.0028464798891116223, "rougeL_recall": 0.3504188695034407, "rougeL_recall_stderr": 0.0025288933574772144, "rougeLsum_fmeasure": 0.4121332244451046, "rougeLsum_fmeasure_stderr": 0.0023416760964820984, "rougeLsum_precision": 0.4736089581434219, "rougeLsum_precision_stderr": 0.00306065010387841, "rougeLsum_recall": 0.396405785185109, "rougeLsum_recall_stderr": 0.002794473833800316}}, "4": {"generate_text_restaurant": {"bleu": 15.080804791535655, "bleu_stderr": 0.14821971591156646, "rouge1_fmeasure": 0.4935313098839454, "rouge1_fmeasure_stderr": 0.002271845302152271, "rouge1_precision": 0.5624928789321237, "rouge1_precision_stderr": 0.0031294366446319713, "rouge1_recall": 0.47592863926028084, "rouge1_recall_stderr": 0.0028908116130151665, "rouge2_fmeasure": 0.2467411297660491, "rouge2_fmeasure_stderr": 0.002101346812765226, "rouge2_precision": 0.2834307304449863, "rouge2_precision_stderr": 0.0026187903085809257, "rouge2_recall": 0.2386118604584549, "rouge2_recall_stderr": 0.0023078935639056814, "rougeL_fmeasure": 0.36587999086871936, "rougeL_fmeasure_stderr": 0.0021050945906898252, "rougeL_precision": 0.4178299399327378, "rougeL_precision_stderr": 0.0028182273866756095, "rougeL_recall": 0.35288616277467044, "rougeL_recall_stderr": 0.002503800280122615, "rougeLsum_fmeasure": 0.4149830902149149, "rougeLsum_fmeasure_stderr": 0.0023565846719075163, "rougeLsum_precision": 0.4726074434579816, "rougeLsum_precision_stderr": 0.0030529008210380413, "rougeLsum_recall": 0.40030956626841174, "rougeLsum_recall_stderr": 
0.002788731746840601}}, "5": {"generate_text_restaurant": {"bleu": 15.265600470980436, "bleu_stderr": 0.20285527803005138, "rouge1_fmeasure": 0.49535698221178465, "rouge1_fmeasure_stderr": 0.0022639532726268564, "rouge1_precision": 0.5617758997558588, "rouge1_precision_stderr": 0.0031160645760995214, "rouge1_recall": 0.477780058424707, "rouge1_recall_stderr": 0.0028609800200870505, "rouge2_fmeasure": 0.24925086960246148, "rouge2_fmeasure_stderr": 0.002125288231383812, "rouge2_precision": 0.2848892339570606, "rouge2_precision_stderr": 0.0026387395366262536, "rouge2_recall": 0.24077912408729735, "rouge2_recall_stderr": 0.0023046951728657333, "rougeL_fmeasure": 0.3687908505277805, "rougeL_fmeasure_stderr": 0.0021558916218128877, "rougeL_precision": 0.4192693573282302, "rougeL_precision_stderr": 0.002855212675860218, "rougeL_recall": 0.35537258514643544, "rougeL_recall_stderr": 0.0025017841926330076, "rougeLsum_fmeasure": 0.4181866743293669, "rougeLsum_fmeasure_stderr": 0.002368198674279566, "rougeLsum_precision": 0.4743649010098781, "rougeLsum_precision_stderr": 0.003080316334291657, "rougeLsum_recall": 0.40318795066110624, "rougeLsum_recall_stderr": 0.0027745110069664625}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.7911957794729954, "bleu_stderr": 0.1028992752886963, "rouge1_fmeasure": 0.2132476859870225, "rouge1_fmeasure_stderr": 0.002596728123368581, "rouge1_precision": 0.1544803647320562, "rouge1_precision_stderr": 0.001998179323003808, "rouge1_recall": 0.3617501123052072, "rouge1_recall_stderr": 0.004401350843198141, "rouge2_fmeasure": 0.04756359555453488, "rouge2_fmeasure_stderr": 0.0015388801074832958, "rouge2_precision": 0.034104422429782094, "rouge2_precision_stderr": 0.0011296363295058075, "rouge2_recall": 0.08341719921119035, "rouge2_recall_stderr": 0.0027753920629418876, "rougeL_fmeasure": 0.15260364524737308, "rougeL_fmeasure_stderr": 0.0018980795169799797, "rougeL_precision": 0.11047768692113931, "rougeL_precision_stderr": 0.0014728181415206798, "rougeL_recall": 0.26038442217264174, "rougeL_recall_stderr": 0.003368032246087583, "rougeLsum_fmeasure": 0.17085907541578757, "rougeLsum_fmeasure_stderr": 0.00219601338601483, "rougeLsum_precision": 0.1235706334215309, "rougeLsum_precision_stderr": 0.0016790133111005653, "rougeLsum_recall": 0.29156249557144304, "rougeLsum_recall_stderr": 0.003855753079955984}}, "1": {"article_DOC_summary": {"bleu": 1.9369867872208306, "bleu_stderr": 0.13301743127426643, "rouge1_fmeasure": 0.21212307020665455, "rouge1_fmeasure_stderr": 0.002805971174994527, "rouge1_precision": 0.18210955050574165, "rouge1_precision_stderr": 0.0030336588185256538, "rouge1_recall": 0.30765438008746904, "rouge1_recall_stderr": 0.0045764351815007235, "rouge2_fmeasure": 0.04293890964191621, "rouge2_fmeasure_stderr": 0.0016754133257735123, "rouge2_precision": 0.03608017605336464, "rouge2_precision_stderr": 0.0015351821810233588, "rouge2_recall": 0.06530630731492977, "rouge2_recall_stderr": 0.0026678483835833266, "rougeL_fmeasure": 0.15522658921799362, "rougeL_fmeasure_stderr": 0.0020874537466411373, "rougeL_precision": 0.133576425244387, "rougeL_precision_stderr": 0.0022712598584840454, "rougeL_recall": 0.2257975651465562, "rougeL_recall_stderr": 0.0035054216942714362, "rougeLsum_fmeasure": 0.16565749532885793, "rougeLsum_fmeasure_stderr": 0.002253857246658309, "rougeLsum_precision": 0.14117554206112032, "rougeLsum_precision_stderr": 0.0023065404958774274, "rougeLsum_recall": 0.24343402256399355, "rougeLsum_recall_stderr": 0.003941092857511189}}, "2": 
{"article_DOC_summary": {"bleu": 2.133141577656285, "bleu_stderr": 0.12965820672946443, "rouge1_fmeasure": 0.2277383324826566, "rouge1_fmeasure_stderr": 0.0030573712743888364, "rouge1_precision": 0.2105718627026518, "rouge1_precision_stderr": 0.0035778413560214016, "rouge1_recall": 0.2986830414520048, "rouge1_recall_stderr": 0.004329540379099283, "rouge2_fmeasure": 0.051547066687828734, "rouge2_fmeasure_stderr": 0.0019622251118453704, "rouge2_precision": 0.04843737420278684, "rouge2_precision_stderr": 0.002078874124454135, "rouge2_recall": 0.06777424154476913, "rouge2_recall_stderr": 0.002554380728108249, "rougeL_fmeasure": 0.17058950590239796, "rougeL_fmeasure_stderr": 0.00245256877334995, "rougeL_precision": 0.15888578947684742, "rougeL_precision_stderr": 0.002954896445415003, "rougeL_recall": 0.22275234069768626, "rougeL_recall_stderr": 0.003313554078249958, "rougeLsum_fmeasure": 0.17778653214796106, "rougeLsum_fmeasure_stderr": 0.0025471887620010904, "rougeLsum_precision": 0.16407328548937136, "rougeLsum_precision_stderr": 0.0029472894981933152, "rougeLsum_recall": 0.23520189891677276, "rougeLsum_recall_stderr": 0.0037453526896800695}}, "3": {"article_DOC_summary": {"bleu": 2.2845962769932853, "bleu_stderr": 0.1118763427251602, "rouge1_fmeasure": 0.22352993200471194, "rouge1_fmeasure_stderr": 0.0034158533172611158, "rouge1_precision": 0.21408207074509233, "rouge1_precision_stderr": 0.003911771705487585, "rouge1_recall": 0.27616262040256995, "rouge1_recall_stderr": 0.00445926401302726, "rouge2_fmeasure": 0.0505319645754636, "rouge2_fmeasure_stderr": 0.0019929881863291095, "rouge2_precision": 0.04872713712902665, "rouge2_precision_stderr": 0.0021207481086453045, "rouge2_recall": 0.06294658987847104, "rouge2_recall_stderr": 0.002453274077945568, "rougeL_fmeasure": 0.16637492784414312, "rougeL_fmeasure_stderr": 0.0027123155106531316, "rougeL_precision": 0.159438518303562, "rougeL_precision_stderr": 0.003102061741719361, "rougeL_recall": 0.20607630574913904, "rougeL_recall_stderr": 0.0035265999447064503, "rougeLsum_fmeasure": 0.17215854927610266, "rougeLsum_fmeasure_stderr": 0.00276757297974389, "rougeLsum_precision": 0.16373162683207845, "rougeLsum_precision_stderr": 0.0030891132218923796, "rougeLsum_recall": 0.2156279714811016, "rougeLsum_recall_stderr": 0.003810201919065186}}, "4": {"article_DOC_summary": {"bleu": 0.3124946765538571, "bleu_stderr": 0.07230250867579978, "rouge1_fmeasure": 0.05885821408809552, "rouge1_fmeasure_stderr": 0.0033400021306721392, "rouge1_precision": 0.06392907254960759, "rouge1_precision_stderr": 0.003876201158464289, "rouge1_recall": 0.06750288591690386, "rouge1_recall_stderr": 0.0039913615182408215, "rouge2_fmeasure": 0.012855134170914497, "rouge2_fmeasure_stderr": 0.0011864715171664183, "rouge2_precision": 0.014573840849595909, "rouge2_precision_stderr": 0.0016332201756935874, "rouge2_recall": 0.01510184445883284, "rouge2_recall_stderr": 0.0014126505769374265, "rougeL_fmeasure": 0.04407299444650032, "rougeL_fmeasure_stderr": 0.0025320491349861813, "rougeL_precision": 0.04895381713163094, "rougeL_precision_stderr": 0.0030882253843767114, "rougeL_recall": 0.0501669133523796, "rougeL_recall_stderr": 0.002985189039759907, "rougeLsum_fmeasure": 0.04628126164174725, "rougeLsum_fmeasure_stderr": 0.002645018363815525, "rougeLsum_precision": 0.05093801249895102, "rougeLsum_precision_stderr": 0.0031687581333617715, "rougeLsum_recall": 0.05318442494362772, "rougeLsum_recall_stderr": 0.00319392235102553}}, "5": {"article_DOC_summary": {"bleu": 1.0828965147492013e-39, 
"bleu_stderr": 2.1508906223493296e-34, "rouge1_fmeasure": 0.0020718557755462, "rouge1_fmeasure_stderr": 0.0005803475030367974, "rouge1_precision": 0.002395128003303927, "rouge1_precision_stderr": 0.000695243693352885, "rouge1_recall": 0.00195574424379908, "rouge1_recall_stderr": 0.0005505695407964535, "rouge2_fmeasure": 0.0001226302404615478, "rouge2_fmeasure_stderr": 7.106807371334149e-05, "rouge2_precision": 0.00013792342491448747, "rouge2_precision_stderr": 7.95881099815707e-05, "rouge2_recall": 0.00011258133899643333, "rouge2_recall_stderr": 6.616712432800021e-05, "rougeL_fmeasure": 0.0014461673142424087, "rougeL_fmeasure_stderr": 0.0003992063154628714, "rougeL_precision": 0.0016797182212470382, "rougeL_precision_stderr": 0.00047877790197405523, "rougeL_recall": 0.0013438433718934864, "rougeL_recall_stderr": 0.00036757969239639373, "rougeLsum_fmeasure": 0.0015096956796575665, "rougeLsum_fmeasure_stderr": 0.00041391116095822775, "rougeLsum_precision": 0.0017248567966735974, "rougeLsum_precision_stderr": 0.0004849887493451295, "rougeLsum_recall": 0.0014510474885315653, "rougeLsum_recall_stderr": 0.0004115402839485218}}}} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..30db2b885b9fea4b2463bcf9d5af106abf38e186 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4358876645741479, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04475556276641748 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08601617848273047, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003027222218230607 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.29226306498620885, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005344776046578116 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11084365567127677, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022963454664604076 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.04041394987853437, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002132583963194437 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1415225530229299, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0034803378779995004 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05146498360167007, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014277295107541841 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.08239970522779808, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0029176133842350504 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.2827601971319833, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00519215142624447 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10611300272902285, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002132214334314073 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0815748628057772, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002939098844173709 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2756279858381666, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0049864009961195395 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10445493190549919, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00215267434014802 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..39edc3b68e98f6343344ae5b386f38fa112b2318 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6458190048668558, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03274318464461181 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.13985160542322855, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004374312136397895 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.32007070778063884, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004962532451445636 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.16170102806244036, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0036142070180597507 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.0694179465016437, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002921286815952664 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15883761787677303, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 
0.003480119679954502 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.07885240324677927, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0023523131620574156 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.12591553956133228, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003854734977633317 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.30061625652088586, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004630766957074663 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.14742576479874345, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0031238502419319854 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.1283694931084644, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003947705918972332 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3021994113995889, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004647780404212457 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1497000837635611, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0032201737172114866 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6fb387adfaaca5f41991d53830504e2e77f574c3 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7970016196921748, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04581096081746233 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.18059337363344904, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005339355314623877 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3595991748021704, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004980869877485108 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.19703997895188197, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rouge1_fmeasure_stderr": 0.0042195368281687335 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.09688101950971822, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0036313101633365223 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.18755499121707667, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00376908816872013 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.10271084479951895, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0029196673345892295 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.15982668252407492, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004603772659928646 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3351391580099154, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004585332134390738 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1772530360597836, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0035959119026034024 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.16383301190827276, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004742860098171715 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.33868792774850437, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004634428210263519 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.18084941775566396, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037099919587234576 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cf2c4a02f9976b7237b29aa4e5d6cf3aed5bb0a2 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.9098079216712224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.049900879662925314 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.19212508744736018, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005617174437077427 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3703600547581712, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004938819064876214 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.20522389412159425, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0043615630325957435 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.10484577205978293, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003926715523602227 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.19383710059664028, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003742953011901514 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.10810375284373198, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003075123686799679 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.17083343128882605, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004911086258050898 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.34476103034505623, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004533133149997157 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.18478247984025434, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003743746721954891 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.17532235438945584, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005084911003429309 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3483193020595072, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045873162051077825 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.18830268691133512, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003857827801954891 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..0d72ba65a841017c0cb6ef9ffe930a56da82d231 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 1.052600188336286, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06665675598684426 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.20108020030540724, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005644545699403245 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3818834779332785, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004965307876320074 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.21317352472996992, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004401402126159575 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.11098039808305214, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003927637491564618 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.204923577311545, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003948475544315969 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.1144786865204847, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003126094675218737 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.1772302061036221, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0048597986989461165 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.35524087803093607, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004631433081108199 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.19087293738824496, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037489593597534975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.183821519223083, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005085692199041701 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.36063124635272004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004673788271599251 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.19652423173229852, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003927577783905062 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..742414f4c299e217caf6898d4c7f281221473e62 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 1.0960931080747824, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06344656593457922 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.21325470731107665, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005803174587983953 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3950162351216541, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004977496708782323 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.22442306072164447, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004490024810310396 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.11715823165872309, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004000375023616823 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.2112966644324947, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003874637116812206 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.11914106590715341, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003087212545803013 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.1866801247482292, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004986038924327078 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.36529275376302506, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0045911093135183 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.19921393354497344, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037728930475513494 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.1931918251374917, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00519492169870331 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3699764207220656, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004618932928096925 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.20449901465956502, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003922872033805495 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..955f9cac7f794f14438ed6456a1f652e6b957cd9 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.122915905754568, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027293510115185423 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.1767818917839268, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033841479310980638 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.12988512607981967, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024439343174664525 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.026806616635118036, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008667140586963389 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.04199675102883295, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014222033630357748 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03000716115226249, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009354733970545372 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.0938740707460975, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002177005796825213 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1374323483251746, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00266526479478189 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.09879731353636668, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": 
"en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001799103726263371 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.11485464277414363, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002596139008897079 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.16449395724493923, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0031654167638919778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.12094018141973355, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002284523760450248 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.278917009551764, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09140957868815724 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..da20bea59cd064190f0e0d7219bcdd7ed7d7032c --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2321033086693269, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003444990523939608 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.21218959051209052, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028780222957239345 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.18879633772816867, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002216973861149929 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.05986007346081548, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0021311948977268975 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.048915032760613236, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014864819960528294 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04395688090556475, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012580561387008646 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"rougeL_precision": 0.18107984090374726, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0028960619334779066 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.162566462675585, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002220609266556534 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14456965085181128, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017115993134306947 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2192803166896013, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0033059632616290542 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.19959677205987747, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002694200019026103 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1776723900847922, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020872816698884362 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.707464673902901, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.1122035248088363 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9eea86531b8eec5e9c73392b9dc70a46d4a6135e --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.3147030364809968, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0037689305738243476 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.25097433153568405, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028925947424101703 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.23558432539564403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021996425715538554 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.09338720402263002, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 
0.002368756352904399 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06942686553346078, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016202813479301297 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06557325455887651, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013735424734152642 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.244935161158131, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003198495702253031 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1911576784626261, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002276055119508395 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.17976921379978278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001722115310366516 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2958662208042827, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003600934016250172 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.23572659332760645, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002741901482334439 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.22099937310878076, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002075646266476727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.7904798767943335, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11122294896753164 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5c25c09358f1dad874de09f2ecbb51e222e27380 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2754936136179268, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004157379347035767 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.20528413095627832, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003149288824376793 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.19838663921126778, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002621759128746396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.08256853694746276, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0024364087557486665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05727334409426975, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015675844642289158 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.055968236855402816, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014034751658766104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.21625894747073696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0034653747339702624 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.15717871923447801, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00246051792291125 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15243563853997083, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0020406692721428466 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2591405389486435, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003962509253114959 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.19231398442318168, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0029662256128315167 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18589041350629654, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024658633555113495 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.8632324313744557, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09299178906804655 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..ac76affab2314fe5000a849d555ada9c0bbd6f31 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.09597320792153687, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003523744199921863 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.06743171611131621, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0025672971207144984 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06581475266895565, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002308785748398076 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.03073553158490422, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018205157024659981 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.02032186815564804, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011741990653970874 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.019691124849550927, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001032704031652877 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07767834454496185, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002968257043282438 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.05327838136711029, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020669018184854763 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.05209993435974213, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018508808521006768 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.09035808108893525, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0033673742748827878 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06286918923105832, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002407780221261254 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.06145306117564354, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021697142437582524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.08259030504562871, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0071288759428394894 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2159ef5c5dd9e9714bc78d0a0418ede3ff8e4ba9 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.015768279475199848, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015994817709509021 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.010208407683374372, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0011101868759327263 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.00991335048881209, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009630926105753386 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.005050282757985984, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007965820847066382 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.003233565984990156, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005539676407167323 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0029524028339129186, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00041005985526957695 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.013277476442981736, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013881298634661826 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.008616852733661218, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009635704353861369 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.008248037341762705, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.000807006653030911 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.014923565167243124, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015288899364523146 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.009697966584991889, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rougeLsum_recall_stderr": 0.001073756967121187 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.009376779842315807, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009193911532830411 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.0184106792465108e-15, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.78802158675685e-13 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a5b2ad5f20f64e31f634443811140a88e1f57ac6 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 3.205452750701432, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.029369723752371055 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.18794203900806028, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002509182484953893 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.27563499652889517, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002522516313059265 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.2026666683966741, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001854044988549804 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.04736777911623166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0009912253924337147 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.07529125273517918, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0015129136922472945 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.054594037237788585, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0010529893200416562 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.16091347621914998, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeL_precision_stderr": 0.0020468972719200795 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.24052353002182375, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002110543832931725 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.17465641737008017, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014217853180858962 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.162408167198493, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0022254242193644625 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.23809586470877792, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024056082119999767 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.1754467264102351, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001775758147384932 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..04d0f7288703cce7f291923f4d1a868a8fc74478 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.314986164630461, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09886007247543747 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5405367881450536, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031944831139177473 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4265799747472657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0030051178365182673 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.45168976701280994, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023803410292144262 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2530645509899577, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": 
null, + "subset": null, + "rouge2_precision_stderr": 0.0025820188553003945 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.19690559018039472, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021546256249304957 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2086255432248638, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002005283814444169 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3941062713485247, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028671074248121063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.307687838606323, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024236181103855607 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.32671702545878634, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020575751841782584 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.44239922810973775, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031220549443630044 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3472525108890491, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00274280048405182 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3682903569907333, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023311514221899116 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7dc5d6580609911da2a0002a11d4c714ef54e7fc --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 13.825382196007423, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14341892864265515 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5584732398385521, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031726923154704037 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.45947280161409504, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002947956873468934 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.47949895116223057, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002274496687632488 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.27533931731718275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026239982120668894 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2249642538757707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002262594422709972 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.234132554236615, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002050727643013208 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4144330916795227, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002865466880028744 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3391232820910882, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002487675496155266 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.35427830623229684, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020644801949430084 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.46586823562780794, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0030764860572737814 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.38321253954384926, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002786697010998289 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.39980527255607845, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002302036382207211 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f6ab311f82e2c52f6eada2b47d5adb66e9f3e794 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.734710333150248, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.13327591335822025 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.566363810944366, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031329176207456266 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4738543349642313, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002916750104399688 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.49284383237521995, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022741384691834527 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2835262882274797, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002620037892529304 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23582515668443185, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002307150627998116 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24473795777586516, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021021762412204888 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4201683174467468, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028464798891116223 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3504188695034407, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025288933574772144 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.364532741901036, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002119688594264684 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4736089581434219, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00306065010387841 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.396405785185109, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002794473833800316 + }, 
+ { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4121332244451046, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023416760964820984 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d9ee1529055b67788a62a7bd5ff66f017e700509 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.080804791535655, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14821971591156646 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5624928789321237, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031294366446319713 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47592863926028084, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028908116130151665 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4935313098839454, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002271845302152271 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2834307304449863, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026187903085809257 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2386118604584549, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023078935639056814 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2467411297660491, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002101346812765226 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4178299399327378, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028182273866756095 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35288616277467044, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 
0.002503800280122615 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.36587999086871936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021050945906898252 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4726074434579816, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0030529008210380413 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.40030956626841174, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002788731746840601 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4149830902149149, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023565846719075163 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8bb85c0c290aab2f07e44e7a9959707f8de851e4 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.265600470980436, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20285527803005138 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5617758997558588, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031160645760995214 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.477780058424707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028609800200870505 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.49535698221178465, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022639532726268564 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2848892339570606, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026387395366262536 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24077912408729735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, 
+ "rouge2_recall_stderr": 0.0023046951728657333 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24925086960246148, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002125288231383812 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4192693573282302, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002855212675860218 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35537258514643544, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025017841926330076 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3687908505277805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021558916218128877 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4743649010098781, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003080316334291657 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.40318795066110624, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027745110069664625 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4181866743293669, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002368198674279566 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_0.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5422fc5b9e88cf81cd8de0ec9b20ce69b1adb753 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1544803647320562, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001998179323003808 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3617501123052072, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004401350843198141 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2132476859870225, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002596728123368581 + }, + 
{ + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.034104422429782094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011296363295058075 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08341719921119035, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027753920629418876 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04756359555453488, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015388801074832958 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.11047768692113931, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014728181415206798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26038442217264174, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003368032246087583 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15260364524737308, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018980795169799797 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1235706334215309, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016790133111005653 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.29156249557144304, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003855753079955984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17085907541578757, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00219601338601483 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7911957794729954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1028992752886963 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_1.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2aa9f3998a44f32b13ec56debc55dc8adff3d331 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.18210955050574165, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 
0.0030336588185256538 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.30765438008746904, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0045764351815007235 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21212307020665455, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002805971174994527 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03608017605336464, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0015351821810233588 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06530630731492977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0026678483835833266 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04293890964191621, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016754133257735123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.133576425244387, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0022712598584840454 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2257975651465562, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035054216942714362 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15522658921799362, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020874537466411373 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.14117554206112032, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0023065404958774274 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.24343402256399355, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003941092857511189 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16565749532885793, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002253857246658309 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.9369867872208306, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13301743127426643 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_2.json 
b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2f3847b288a0a2d37e0fc250fafe1b45e4b4628e --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.2105718627026518, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0035778413560214016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2986830414520048, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004329540379099283 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2277383324826566, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0030573712743888364 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.04843737420278684, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002078874124454135 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06777424154476913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002554380728108249 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.051547066687828734, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0019622251118453704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.15888578947684742, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002954896445415003 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.22275234069768626, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003313554078249958 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.17058950590239796, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00245256877334995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.16407328548937136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0029472894981933152 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.23520189891677276, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037453526896800695 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17778653214796106, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0025471887620010904 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.133141577656285, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12965820672946443 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_3.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..56fdc8e0520e2c7ded6051768f86e4bcb94ebdbe --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.21408207074509233, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003911771705487585 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.27616262040256995, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00445926401302726 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.22352993200471194, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0034158533172611158 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.04872713712902665, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0021207481086453045 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06294658987847104, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002453274077945568 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0505319645754636, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0019929881863291095 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.159438518303562, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003102061741719361 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.20607630574913904, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035265999447064503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16637492784414312, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0027123155106531316 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.16373162683207845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0030891132218923796 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2156279714811016, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003810201919065186 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17215854927610266, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00276757297974389 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.2845962769932853, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1118763427251602 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_4.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ae3f2181c3f30d58a7e0776e04d221bd97a41218 --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.06392907254960759, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003876201158464289 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.06750288591690386, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0039913615182408215 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05885821408809552, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0033400021306721392 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.014573840849595909, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016332201756935874 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.01510184445883284, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014126505769374265 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.012855134170914497, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011864715171664183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.04895381713163094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0030882253843767114 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0501669133523796, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002985189039759907 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04407299444650032, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0025320491349861813 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.05093801249895102, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0031687581333617715 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.05318442494362772, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00319392235102553 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04628126164174725, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002645018363815525 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.3124946765538571, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07230250867579978 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_5.json b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a0d6a314006334d64bc576a0db54b68b081263ae --- /dev/null +++ b/4b284b12boscar/evaluation/generation/slim.4b284b12boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002395128003303927, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.000695243693352885 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.00195574424379908, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005505695407964535 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0020718557755462, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0005803475030367974 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.00013792342491448747, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 7.95881099815707e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00011258133899643333, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 6.616712432800021e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0001226302404615478, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 7.106807371334149e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0016797182212470382, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 
0.00047877790197405523 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0013438433718934864, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00036757969239639373 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0014461673142424087, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0003992063154628714 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0017248567966735974, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0004849887493451295 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0014510474885315653, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004115402839485218 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0015096956796575665, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00041391116095822775 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.0828965147492013e-39, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 2.1508906223493296e-34 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b12boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/rankeval/4b284b12boscar_0.json b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8e94f4b6802ccb80616481c880f3e2f6ef17fbf7 --- /dev/null +++ b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.324, + "acc_stderr": 0.01480686473373886 + }, + "anli_r2": { + "acc": 0.327, + "acc_stderr": 0.014842213153411239 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.013680495725767785 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.3806146572104019 + }, + "copa": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276 + }, + "hellaswag": { + "acc": 0.4027086237801235, + "acc_stderr": 0.004894407257215796, + "acc_norm": 0.5084644493128859, + "acc_norm_stderr": 0.004989066355449556 + }, + "rte": { + "acc": 0.5270758122743683, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.5595895816890292, + "acc_stderr": 0.01395233031191561 + }, + "storycloze_2016": { + "acc": 0.6755745590593266, + "acc_stderr": 0.010826131344990888 + }, + "boolq": { + "acc": 0.5608562691131499, + "acc_stderr": 0.008680038923540374 + }, + "arc_easy": { + "acc": 0.5488215488215489, + "acc_stderr": 0.010210757101073482, + "acc_norm": 0.48863636363636365, + "acc_norm_stderr": 0.010257133441117115 + }, + "arc_challenge": { + "acc": 0.23378839590443687, + "acc_stderr": 0.01236822537850715, + "acc_norm": 0.27047781569965873, + "acc_norm_stderr": 
0.012980954547659554 + }, + "sciq": { + "acc": 0.815, + "acc_stderr": 0.012285191326386696, + "acc_norm": 0.724, + "acc_norm_stderr": 0.014142984975740668 + }, + "piqa": { + "acc": 0.7187159956474428, + "acc_stderr": 0.010490509832327423, + "acc_norm": 0.7143634385201306, + "acc_norm_stderr": 0.010539303948661915 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/rankeval/4b284b12boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..8e94f4b6802ccb80616481c880f3e2f6ef17fbf7 --- /dev/null +++ b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.324, + "acc_stderr": 0.01480686473373886 + }, + "anli_r2": { + "acc": 0.327, + "acc_stderr": 0.014842213153411239 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.013680495725767785 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.3806146572104019 + }, + "copa": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276 + }, + "hellaswag": { + "acc": 0.4027086237801235, + "acc_stderr": 0.004894407257215796, + "acc_norm": 0.5084644493128859, + "acc_norm_stderr": 0.004989066355449556 + }, + "rte": { + "acc": 0.5270758122743683, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.5595895816890292, + "acc_stderr": 0.01395233031191561 + }, + "storycloze_2016": { + "acc": 0.6755745590593266, + "acc_stderr": 0.010826131344990888 + }, + "boolq": { + "acc": 0.5608562691131499, + "acc_stderr": 0.008680038923540374 + }, + "arc_easy": { + "acc": 0.5488215488215489, + "acc_stderr": 0.010210757101073482, + "acc_norm": 0.48863636363636365, + "acc_norm_stderr": 0.010257133441117115 + }, + "arc_challenge": { + "acc": 0.23378839590443687, + "acc_stderr": 0.01236822537850715, + "acc_norm": 0.27047781569965873, + "acc_norm_stderr": 0.012980954547659554 + }, + "sciq": { + "acc": 0.815, + "acc_stderr": 0.012285191326386696, + "acc_norm": 0.724, + "acc_norm_stderr": 0.014142984975740668 + }, + "piqa": { + "acc": 0.7187159956474428, + "acc_stderr": 0.010490509832327423, + "acc_norm": 0.7143634385201306, + "acc_norm_stderr": 0.010539303948661915 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/rankeval/4b284b12boscar_1.json b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1de4149c31ad4f9ad6f4ec96ebd4f3dc7b7561f6 --- /dev/null +++ b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.324, + "acc_stderr": 0.014806864733738859 + }, + "anli_r2": { + "acc": 0.325, + "acc_stderr": 0.014818724459095526 + }, + "anli_r3": { + "acc": 0.33166666666666667, + "acc_stderr": 0.013596836729485163 + }, + "cb": { + "acc": 
0.4107142857142857, + "acc_stderr": 0.0663363415035954, + "f1": 0.3299501424501424 + }, + "copa": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127 + }, + "hellaswag": { + "acc": 0.40091615216092413, + "acc_stderr": 0.004890824718530301, + "acc_norm": 0.5123481378211512, + "acc_norm_stderr": 0.0049882595304724655 + }, + "rte": { + "acc": 0.5306859205776173, + "acc_stderr": 0.03003973059219781 + }, + "winogrande": { + "acc": 0.5461720599842147, + "acc_stderr": 0.013992441563707063 + }, + "storycloze_2016": { + "acc": 0.6632816675574559, + "acc_stderr": 0.010928525619392454 + }, + "boolq": { + "acc": 0.5525993883792049, + "acc_stderr": 0.008696530539281539 + }, + "arc_easy": { + "acc": 0.5837542087542088, + "acc_stderr": 0.010114819404500866, + "acc_norm": 0.5526094276094277, + "acc_norm_stderr": 0.01020283238541565 + }, + "arc_challenge": { + "acc": 0.25426621160409557, + "acc_stderr": 0.012724999945157746, + "acc_norm": 0.2773037542662116, + "acc_norm_stderr": 0.013082095839059374 + }, + "sciq": { + "acc": 0.883, + "acc_stderr": 0.010169287802713329, + "acc_norm": 0.874, + "acc_norm_stderr": 0.010499249222408046 + }, + "piqa": { + "acc": 0.7274211099020674, + "acc_stderr": 0.010389256803296021, + "acc_norm": 0.720348204570185, + "acc_norm_stderr": 0.010471899530306559 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/rankeval/4b284b12boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..1de4149c31ad4f9ad6f4ec96ebd4f3dc7b7561f6 --- /dev/null +++ b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.324, + "acc_stderr": 0.014806864733738859 + }, + "anli_r2": { + "acc": 0.325, + "acc_stderr": 0.014818724459095526 + }, + "anli_r3": { + "acc": 0.33166666666666667, + "acc_stderr": 0.013596836729485163 + }, + "cb": { + "acc": 0.4107142857142857, + "acc_stderr": 0.0663363415035954, + "f1": 0.3299501424501424 + }, + "copa": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127 + }, + "hellaswag": { + "acc": 0.40091615216092413, + "acc_stderr": 0.004890824718530301, + "acc_norm": 0.5123481378211512, + "acc_norm_stderr": 0.0049882595304724655 + }, + "rte": { + "acc": 0.5306859205776173, + "acc_stderr": 0.03003973059219781 + }, + "winogrande": { + "acc": 0.5461720599842147, + "acc_stderr": 0.013992441563707063 + }, + "storycloze_2016": { + "acc": 0.6632816675574559, + "acc_stderr": 0.010928525619392454 + }, + "boolq": { + "acc": 0.5525993883792049, + "acc_stderr": 0.008696530539281539 + }, + "arc_easy": { + "acc": 0.5837542087542088, + "acc_stderr": 0.010114819404500866, + "acc_norm": 0.5526094276094277, + "acc_norm_stderr": 0.01020283238541565 + }, + "arc_challenge": { + "acc": 0.25426621160409557, + "acc_stderr": 0.012724999945157746, + "acc_norm": 0.2773037542662116, + "acc_norm_stderr": 0.013082095839059374 + }, + "sciq": { + "acc": 0.883, + "acc_stderr": 0.010169287802713329, + "acc_norm": 0.874, + "acc_norm_stderr": 0.010499249222408046 + }, + "piqa": { + "acc": 0.7274211099020674, + "acc_stderr": 
0.010389256803296021, + "acc_norm": 0.720348204570185, + "acc_norm_stderr": 0.010471899530306559 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/rankeval/4b284b12boscar_2.json b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_2.json new file mode 100644 index 0000000000000000000000000000000000000000..90d78ec276ec041123b836dbb756a6fc3a021372 --- /dev/null +++ b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811478 + }, + "anli_r2": { + "acc": 0.338, + "acc_stderr": 0.014965960710224487 + }, + "anli_r3": { + "acc": 0.33916666666666667, + "acc_stderr": 0.013672343491681813 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.2570314675577834 + }, + "copa": { + "acc": 0.74, + "acc_stderr": 0.04408440022768079 + }, + "hellaswag": { + "acc": 0.40151364270065726, + "acc_stderr": 0.004892026457294707, + "acc_norm": 0.5150368452499502, + "acc_norm_stderr": 0.004987524454849712 + }, + "rte": { + "acc": 0.51985559566787, + "acc_stderr": 0.030072723167317184 + }, + "winogrande": { + "acc": 0.5438042620363063, + "acc_stderr": 0.013998453610924324 + }, + "storycloze_2016": { + "acc": 0.6654195617316943, + "acc_stderr": 0.010911318967127935 + }, + "boolq": { + "acc": 0.5431192660550459, + "acc_stderr": 0.008712475433089477 + }, + "arc_easy": { + "acc": 0.5942760942760943, + "acc_stderr": 0.010075755540128873, + "acc_norm": 0.5782828282828283, + "acc_norm_stderr": 0.010133255284012323 + }, + "arc_challenge": { + "acc": 0.25853242320819114, + "acc_stderr": 0.012794553754288686, + "acc_norm": 0.2841296928327645, + "acc_norm_stderr": 0.013179442447653886 + }, + "sciq": { + "acc": 0.89, + "acc_stderr": 0.009899393819724442, + "acc_norm": 0.89, + "acc_norm_stderr": 0.009899393819724453 + }, + "piqa": { + "acc": 0.7257889009793254, + "acc_stderr": 0.010408618664933382, + "acc_norm": 0.7170837867247007, + "acc_norm_stderr": 0.010508949177489678 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/rankeval/4b284b12boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..90d78ec276ec041123b836dbb756a6fc3a021372 --- /dev/null +++ b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811478 + }, + "anli_r2": { + "acc": 0.338, + "acc_stderr": 0.014965960710224487 + }, + "anli_r3": { + "acc": 0.33916666666666667, + "acc_stderr": 0.013672343491681813 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.2570314675577834 + }, + "copa": { + "acc": 0.74, + "acc_stderr": 0.04408440022768079 + }, + "hellaswag": { + "acc": 0.40151364270065726, + "acc_stderr": 0.004892026457294707, + 
"acc_norm": 0.5150368452499502, + "acc_norm_stderr": 0.004987524454849712 + }, + "rte": { + "acc": 0.51985559566787, + "acc_stderr": 0.030072723167317184 + }, + "winogrande": { + "acc": 0.5438042620363063, + "acc_stderr": 0.013998453610924324 + }, + "storycloze_2016": { + "acc": 0.6654195617316943, + "acc_stderr": 0.010911318967127935 + }, + "boolq": { + "acc": 0.5431192660550459, + "acc_stderr": 0.008712475433089477 + }, + "arc_easy": { + "acc": 0.5942760942760943, + "acc_stderr": 0.010075755540128873, + "acc_norm": 0.5782828282828283, + "acc_norm_stderr": 0.010133255284012323 + }, + "arc_challenge": { + "acc": 0.25853242320819114, + "acc_stderr": 0.012794553754288686, + "acc_norm": 0.2841296928327645, + "acc_norm_stderr": 0.013179442447653886 + }, + "sciq": { + "acc": 0.89, + "acc_stderr": 0.009899393819724442, + "acc_norm": 0.89, + "acc_norm_stderr": 0.009899393819724453 + }, + "piqa": { + "acc": 0.7257889009793254, + "acc_stderr": 0.010408618664933382, + "acc_norm": 0.7170837867247007, + "acc_norm_stderr": 0.010508949177489678 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/rankeval/4b284b12boscar_3.json b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4bc815a2c497f6419071d6c1f1a18adc33980157 --- /dev/null +++ b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.322, + "acc_stderr": 0.014782913600996683 + }, + "anli_r2": { + "acc": 0.341, + "acc_stderr": 0.014998131348402699 + }, + "anli_r3": { + "acc": 0.33166666666666667, + "acc_stderr": 0.01359683672948516 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.06703189227942398, + "f1": 0.3560833560833561 + }, + "copa": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394 + }, + "hellaswag": { + "acc": 0.4025094602668791, + "acc_stderr": 0.004894012555642636, + "acc_norm": 0.5155347540330611, + "acc_norm_stderr": 0.004987372476207029 + }, + "rte": { + "acc": 0.5379061371841155, + "acc_stderr": 0.030009848912529117 + }, + "winogrande": { + "acc": 0.55327545382794, + "acc_stderr": 0.013972488371616687 + }, + "storycloze_2016": { + "acc": 0.677712453233565, + "acc_stderr": 0.010807461374996361 + }, + "boolq": { + "acc": 0.5311926605504587, + "acc_stderr": 0.008728020822889253 + }, + "arc_easy": { + "acc": 0.5858585858585859, + "acc_stderr": 0.010107387673002531, + "acc_norm": 0.571969696969697, + "acc_norm_stderr": 0.01015294331642626 + }, + "arc_challenge": { + "acc": 0.2627986348122867, + "acc_stderr": 0.012862523175351333, + "acc_norm": 0.2935153583617747, + "acc_norm_stderr": 0.013307250444941113 + }, + "sciq": { + "acc": 0.898, + "acc_stderr": 0.009575368801653876, + "acc_norm": 0.899, + "acc_norm_stderr": 0.009533618929340973 + }, + "piqa": { + "acc": 0.7252448313384113, + "acc_stderr": 0.010415033676676037, + "acc_norm": 0.7285092491838956, + "acc_norm_stderr": 0.010376251176596137 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git 
a/4b284b12boscar/evaluation/rankeval/4b284b12boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..4bc815a2c497f6419071d6c1f1a18adc33980157 --- /dev/null +++ b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.322, + "acc_stderr": 0.014782913600996683 + }, + "anli_r2": { + "acc": 0.341, + "acc_stderr": 0.014998131348402699 + }, + "anli_r3": { + "acc": 0.33166666666666667, + "acc_stderr": 0.01359683672948516 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.06703189227942398, + "f1": 0.3560833560833561 + }, + "copa": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394 + }, + "hellaswag": { + "acc": 0.4025094602668791, + "acc_stderr": 0.004894012555642636, + "acc_norm": 0.5155347540330611, + "acc_norm_stderr": 0.004987372476207029 + }, + "rte": { + "acc": 0.5379061371841155, + "acc_stderr": 0.030009848912529117 + }, + "winogrande": { + "acc": 0.55327545382794, + "acc_stderr": 0.013972488371616687 + }, + "storycloze_2016": { + "acc": 0.677712453233565, + "acc_stderr": 0.010807461374996361 + }, + "boolq": { + "acc": 0.5311926605504587, + "acc_stderr": 0.008728020822889253 + }, + "arc_easy": { + "acc": 0.5858585858585859, + "acc_stderr": 0.010107387673002531, + "acc_norm": 0.571969696969697, + "acc_norm_stderr": 0.01015294331642626 + }, + "arc_challenge": { + "acc": 0.2627986348122867, + "acc_stderr": 0.012862523175351333, + "acc_norm": 0.2935153583617747, + "acc_norm_stderr": 0.013307250444941113 + }, + "sciq": { + "acc": 0.898, + "acc_stderr": 0.009575368801653876, + "acc_norm": 0.899, + "acc_norm_stderr": 0.009533618929340973 + }, + "piqa": { + "acc": 0.7252448313384113, + "acc_stderr": 0.010415033676676037, + "acc_norm": 0.7285092491838956, + "acc_norm_stderr": 0.010376251176596137 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/rankeval/4b284b12boscar_4.json b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_4.json new file mode 100644 index 0000000000000000000000000000000000000000..40e6662cbfe8aa881f7afe294b78ce91dec6a832 --- /dev/null +++ b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.333, + "acc_stderr": 0.01491084616422987 + }, + "anli_r2": { + "acc": 0.338, + "acc_stderr": 0.014965960710224487 + }, + "anli_r3": { + "acc": 0.3275, + "acc_stderr": 0.01355321116725194 + }, + "cb": { + "acc": 0.4642857142857143, + "acc_stderr": 0.06724777654937658, + "f1": 0.3895559795009913 + }, + "copa": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814 + }, + "hellaswag": { + "acc": 0.4008165704043019, + "acc_stderr": 0.0048906236932436216, + "acc_norm": 0.5142401911969727, + "acc_norm_stderr": 0.004987757314769834 + }, + "rte": { + "acc": 0.5234657039711191, + "acc_stderr": 0.03006330041190266 + }, + "winogrande": { + "acc": 0.5540647198105761, + "acc_stderr": 0.013970093482330704 + }, + "storycloze_2016": { + "acc": 0.6707642971672902, + "acc_stderr": 0.010867199207548977 + }, + "boolq": { + "acc": 
0.518960244648318, + "acc_stderr": 0.008738765179491934 + }, + "arc_easy": { + "acc": 0.5980639730639731, + "acc_stderr": 0.010060521220920566, + "acc_norm": 0.5854377104377104, + "acc_norm_stderr": 0.010108889212447783 + }, + "arc_challenge": { + "acc": 0.26023890784982934, + "acc_stderr": 0.012821930225112568, + "acc_norm": 0.29266211604095566, + "acc_norm_stderr": 0.01329591610361942 + }, + "sciq": { + "acc": 0.908, + "acc_stderr": 0.009144376393151108, + "acc_norm": 0.913, + "acc_norm_stderr": 0.008916866630745908 + }, + "piqa": { + "acc": 0.721436343852013, + "acc_stderr": 0.010459397235965189, + "acc_norm": 0.7219804134929271, + "acc_norm_stderr": 0.010453117358332814 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/rankeval/4b284b12boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..40e6662cbfe8aa881f7afe294b78ce91dec6a832 --- /dev/null +++ b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.333, + "acc_stderr": 0.01491084616422987 + }, + "anli_r2": { + "acc": 0.338, + "acc_stderr": 0.014965960710224487 + }, + "anli_r3": { + "acc": 0.3275, + "acc_stderr": 0.01355321116725194 + }, + "cb": { + "acc": 0.4642857142857143, + "acc_stderr": 0.06724777654937658, + "f1": 0.3895559795009913 + }, + "copa": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814 + }, + "hellaswag": { + "acc": 0.4008165704043019, + "acc_stderr": 0.0048906236932436216, + "acc_norm": 0.5142401911969727, + "acc_norm_stderr": 0.004987757314769834 + }, + "rte": { + "acc": 0.5234657039711191, + "acc_stderr": 0.03006330041190266 + }, + "winogrande": { + "acc": 0.5540647198105761, + "acc_stderr": 0.013970093482330704 + }, + "storycloze_2016": { + "acc": 0.6707642971672902, + "acc_stderr": 0.010867199207548977 + }, + "boolq": { + "acc": 0.518960244648318, + "acc_stderr": 0.008738765179491934 + }, + "arc_easy": { + "acc": 0.5980639730639731, + "acc_stderr": 0.010060521220920566, + "acc_norm": 0.5854377104377104, + "acc_norm_stderr": 0.010108889212447783 + }, + "arc_challenge": { + "acc": 0.26023890784982934, + "acc_stderr": 0.012821930225112568, + "acc_norm": 0.29266211604095566, + "acc_norm_stderr": 0.01329591610361942 + }, + "sciq": { + "acc": 0.908, + "acc_stderr": 0.009144376393151108, + "acc_norm": 0.913, + "acc_norm_stderr": 0.008916866630745908 + }, + "piqa": { + "acc": 0.721436343852013, + "acc_stderr": 0.010459397235965189, + "acc_norm": 0.7219804134929271, + "acc_norm_stderr": 0.010453117358332814 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/rankeval/4b284b12boscar_5.json b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ae6b3462dcc782103dd70014b477ef34b9d76709 --- 
/dev/null +++ b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_5.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.33, + "acc_stderr": 0.014876872027456732 + }, + "anli_r2": { + "acc": 0.328, + "acc_stderr": 0.014853842487270336 + }, + "anli_r3": { + "acc": 0.34833333333333333, + "acc_stderr": 0.013759437498874073 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.25000000000000006 + }, + "copa": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394 + }, + "hellaswag": { + "acc": 0.40380402310296754, + "acc_stderr": 0.004896563126116814, + "acc_norm": 0.5171280621390162, + "acc_norm_stderr": 0.004986852842576728 + }, + "rte": { + "acc": 0.5090252707581228, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.5374901341752171, + "acc_stderr": 0.014012928183336578 + }, + "storycloze_2016": { + "acc": 0.677712453233565, + "acc_stderr": 0.010807461374996361 + }, + "boolq": { + "acc": 0.5070336391437309, + "acc_stderr": 0.008744189661475107 + }, + "arc_easy": { + "acc": 0.5959595959595959, + "acc_stderr": 0.010069061649549547, + "acc_norm": 0.5765993265993266, + "acc_norm_stderr": 0.010138671005289049 + }, + "arc_challenge": { + "acc": 0.2687713310580205, + "acc_stderr": 0.01295506596371069, + "acc_norm": 0.2909556313993174, + "acc_norm_stderr": 0.013273077865907586 + }, + "sciq": { + "acc": 0.904, + "acc_stderr": 0.009320454434783207, + "acc_norm": 0.908, + "acc_norm_stderr": 0.009144376393151127 + }, + "piqa": { + "acc": 0.720892274211099, + "acc_stderr": 0.010465657948498228, + "acc_norm": 0.720892274211099, + "acc_norm_stderr": 0.01046565794849823 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12boscar/evaluation/rankeval/4b284b12boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..ae6b3462dcc782103dd70014b477ef34b9d76709 --- /dev/null +++ b/4b284b12boscar/evaluation/rankeval/4b284b12boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.33, + "acc_stderr": 0.014876872027456732 + }, + "anli_r2": { + "acc": 0.328, + "acc_stderr": 0.014853842487270336 + }, + "anli_r3": { + "acc": 0.34833333333333333, + "acc_stderr": 0.013759437498874073 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.25000000000000006 + }, + "copa": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394 + }, + "hellaswag": { + "acc": 0.40380402310296754, + "acc_stderr": 0.004896563126116814, + "acc_norm": 0.5171280621390162, + "acc_norm_stderr": 0.004986852842576728 + }, + "rte": { + "acc": 0.5090252707581228, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.5374901341752171, + "acc_stderr": 0.014012928183336578 + }, + "storycloze_2016": { + "acc": 0.677712453233565, + "acc_stderr": 0.010807461374996361 + }, + "boolq": { + "acc": 0.5070336391437309, + "acc_stderr": 0.008744189661475107 + }, + "arc_easy": { + "acc": 0.5959595959595959, + "acc_stderr": 0.010069061649549547, + "acc_norm": 0.5765993265993266, + "acc_norm_stderr": 0.010138671005289049 + }, + "arc_challenge": { + "acc": 
0.2687713310580205, + "acc_stderr": 0.01295506596371069, + "acc_norm": 0.2909556313993174, + "acc_norm_stderr": 0.013273077865907586 + }, + "sciq": { + "acc": 0.904, + "acc_stderr": 0.009320454434783207, + "acc_norm": 0.908, + "acc_norm_stderr": 0.009144376393151127 + }, + "piqa": { + "acc": 0.720892274211099, + "acc_stderr": 0.010465657948498228, + "acc_norm": 0.720892274211099, + "acc_norm_stderr": 0.01046565794849823 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a39ace13db3937f20c6200d4e5e4d556cacddbd6 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c5a1ec6d965c5e7db0c8990c64430992c627f6b2537eb8fa491e594970a652d +size 199058647 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b5dc0261d719a67e56b496bbb6237bc530480b1 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aae328913ea53e8b8849fb237411adbc58ffb755dfa57215d772c0507ab88628 +size 199058647 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f230128adf6c6020de332103d297349280348d63 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:534f9d7760b6b1f7814fe24ffb81c3310b4e9512aa2b1d10da5d4cc623f786f9 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80d9a8de6b04af6b7f81a969618dd8348bce4ee1 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d1687ffebbe58baee7cd3fb2e303b47bcabd51465944d3b7cc99644945f7a8b +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd4c4bed38a051e483106cb65f530a9bc0f0fffc --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcb95540fd8ee3a9d73dbbfa006712543c577008f6a083b805ed03edec59def4 +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c578a732f37af4de46039b168d6ba5fcfd788cb2 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5da6ceb19ae56f75fb7a32ea4bd86fa1b8e084d2cd7f8fb137326294fc8821e +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1cf91bb53cecbd5fd208d554e71fe3f2b6041ee7 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c273e06cf83c07875dbbda3110141dc56a5b981346836df4345d92a3b4eb7a9 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69a2ea80d96c680743682cfc39d04221c8e9e779 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff1ca762f09c60e093c6e959194b574cec856335aad5a901732b5aeb71d4a567 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..90a61a6cdf153992fb3ac5fd7fceb2d001bc5a48 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f69a8e63cf861d4f4efe1ee59f9fb51c5b3969cd5d52db42c5c25a783978f582 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1f311322e6529617d27d1f89788db6b76304303 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1326e648d77f9a6c199719201d8d7c48d1b1077f1213603685f46ed704387ae8 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8faf432292f9bb54d9ccb74cb8430d0894804003 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a1a911f9e2926fac0cd23adad602c63ebf55a4b0d4154a812f8f90faf2fb15e +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3986eb71f71f4c05bd491f59f55e9078e2d31b16 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b22f4f1b885b15813a2ea4d33b9f124e69e39395a116b49589d5f9f510e98f99 +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd7ecc633338fecdea1796ab187c033027e5e467 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:064d8c3d8bd39212971d51122a865e99f01395f0ad73519d7ccb35e4ebdc7723 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ee6fe9af4e0ceb7076db509d844bcc236c1a5ee --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a74e3ddc8fb0806b3ceb3d766536cbb3878190511fc840b7206b69fb241042dc +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4137e694a66c92f0633c21f24fbb5cd8a7c85855 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07f54c1992926916376a49caa3694bfa36df96f4ded54e075c4e41f044e18aa1 +size 199058669 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59964d9567e030eb3d6ee2ccefc3cc55677de05b --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1eb39216d95eff10c2828440091486afda5aa82c790572f80a663f96e5242c65 +size 199058669 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..157e37649a700ffdf2268765fc2b3d7c4345cbaf --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41b4538f5c3967add2746d44cd879e38899cc5fee6e3009524be89363f70c1a7 +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63eb34e898e8b5de01c21c1bf51330e7110d7184 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d0b15f22fe7831e5b0fe58dbcc7c4823e2de8fd18457d192abe8c99211e6be8 +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..608a6d88601aa9921b04342581a8df55c5d8b7f5 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7207e6e753cee92fdedd50216f157010a5a2a0fe795845d8595b0e232936e71c +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..50608eedeb7e84511f389f3645973b2a0bc17586 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626e1ffe76736a78445fb2d0a9919da7d3c5fa3847565ceb9eb7385c6edd868f +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76050b931fcf45517eef86948410aa8af3fa6fe3 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cda412121806442540fcb6772bd3cbc0e3c97fc7ab5d85b5af64fddc1220a13 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40655e4cd7a2a9eeb144ef44a4a7170bb30da505 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e7900c391e3b5deeb441a7a24717183aacab0249912d948bebe61a1288be0c4 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..551ecece8a8d5114309e13dab70c987257ae9436 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6256b1522e2d808bed5a83d03245c0c96012b217ab7d3b19b1249747059e4f47 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..989c37871a3d412a5f2b78bc0f126b14ae0f413f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c49ca5b7594f951f3f49e17fba644684f69f408dea5b919279981fb3ab41102 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fba916f26d77b2e10b6855e3d07a732e854b973 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:c3a49c72d7a236aef1c1dddf8ff0c9bdf5df905683973439d6fe26c8f381e039 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a34d1825b914d5b1faf77701567faef94ead948 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d13526adf742125f7d926ada59764952aa089c374a70fef684714fe0cb4b25f9 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38d182e502c8dcee9310f4027cd53d85c72b14b9 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:029348371ef51e3d5098cad1383f973297d2864f8f2646c66e6770737b77ca6b +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a92bb6eb308c1c3bec584b07f4b73a088f0db2fe --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c1bef4e466b5cd2a46fa0a61f9fc71aa23ffdc008a65918e697c99f9740a3d6 +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c47ce210c252590d7174571df1574148caef1b3e --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2feb6732fb89e22baef9e5479f25dc8344bafa909d2076a810c372237ae478b9 +size 199058605 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..efbd2d545bcd320ef82dee1184086bd429a027d6 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547f209d8e42b2e5dbbcb47339a9cfc3cdedb3573304af86f658de002732b79e +size 199058605 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8bc7c41d77be9e4490b99d9b7911678857090f5 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf2eff400d6d0a9069937aab22346714e9c6e684eacf8fce9787e9c82f78615d +size 199058669 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39f67be1ab20bcfcda9485c4bf3f3d235a93b674 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b60255c205a1fa056c4005bacb6377e5f7dfb374c6dd0ff29540d55a85d1bef +size 199058669 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d8aab8bb357ccf2b7b26ea8ba5863cb7b6d3295 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f79f0f06e303dd147b12c004ee18a2fc5c7e5f91c2870c07311726932161a662 +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5fd4210d5931cb212ca656a8d250a23c9c04177 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0bd82a826982aae1effca6e4426b87a93a7d7c4a88f0c0fdcaa2570117f3ff6 +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9dded183e048a88999b68cd6b27e047b4478e2c6 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b17c30307abcb3773454f536b026165bd32faf116c24470135443348a854bb5 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d581407b12f890988e2ef3e68eaf8a4125288933 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57cc98290c37ea06b45226ebb558b742793d73ff5cb993cb5473464b563204a7 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de29e4706e738a94aee210fa9551956057ea0b49 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dd05085924b9d86857ca73a33df89f849cc213211f48dd4f123a819742fe2eb +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..924a7fbcc46c34af4ba94001014def76dda2bbd5 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:801e7399b0624b6f680114014defe68772cd4a9d994d2e69dc79eae4247177c8 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d324c7f39eadfaa699a3044e5ebb4a0f472dd51 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28d41027b38aac43a15f3ccc9e26f7fb7b711cdaf956969bf4e713d447344d3d +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6a4b506fefb398c786e67491a3b4f4b0eb22cbe --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cb76854a814d589fae5eea80455769239e9bf49e2f6c4cc3910fa2837a975b4 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e168c84e79e4b50cfc5006a23b62e19cd9e157ad --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c19506ae4166eb4bb8c4972a30abe7938eb99c669d3ef01f210277a8b39fc5 +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f46050759e0464dcbad0726eaa0d323d688c9a0a --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d62d58b56db963f606bacb8f51655ad4ccfb71c1ccc2c462d7d863dd084e027 +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..652c723b7f575a3f95ea356674717f602c0d9545 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8532956c732d55bab5422bef44e1c4767bf6f028ab9017000e03c6b5d6df447 +size 199058669 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb344cc2a569271ef9f0a1c7debd65daccb5f092 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ca27ab9d5e17a59dd627f2444b323d4788f73d2d0ac14e2df68d02b56a1ac95 +size 199058669 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cd2ae390d960cb1a289813893104f6415452757 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c182b89e23b5a5dbc4f5c601da11d95afcf8c64400c0473c01500b91e0a00b7f +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4280605e045d054431ec4ed370ccea7a576fc983 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ed0cb07cce37f730578bc341cfc93f6ed52d4bf92032fdfe95d6fb06dd38218 +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c33348a3d7cb5236a00788c6c5d5bc05f41eea9 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fc7b630e641a0bdea0379c99e66c3f921213910c79527aca2bd5eac0cc69544 +size 199058669 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c17897128d20aaa170d80414361f1623e7a580c --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81702c460697ac84099791eba33bcf5400a158f91b97e2d1eb18a42287be652a +size 199058669 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c0027ca1661f01d515f3ed7c827e30e88c7cd9e --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c06fb17fa4e68be9f2178381f259681fffa61b348dd4e039ab015513ab5b825f +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..143b2689e6a10fe53d67f2f694e592756c3a3b67 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b0f9b37d6b65216d3f0c8393e3679c12316d1bcb0cac2c37dce7e635deb8ba3 +size 199058797 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72edb8390f08e8fb0c486bca6d4e1c1076ceb3c5 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:4f2e5b60f80e05d080d4346f70ad7fd3612878f0a886404f66eb3dfcd43415d5 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd349995e5331c9ae617907b1032eceb8b98b848 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ca51bd00ab1ca40a67cc005f099da5f4bb4b83e7a29a622514d76d6871e50e8 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c3806a30c8572371237fd0d5b5bdb156168f119 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:209e61bca3d9a32d6f743f2670384f80b4d32af47c3cfce73e6fd8495c5b1792 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af13c02f1e6b37a728d9d7865526be000e695ece --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20140f25a9ee78491d3efc9affd8aae0f2cfb2fc98e0f5d2f1fe8aecf888c4d8 +size 199058733 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ebf94f431c27090eafcec18c2db41951d80cefe --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e7dbd546cf2a90dc7ee9f0aadd4ff2d1650efa8c18197858f7a84d0fa106dc8 +size 199058669 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0d4126551702f655cb11f0c6acde1b214f88300 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:717662f7b0fd4513fda54bdba71156fad4671f01940a987513fc73231a66bea1 +size 199058669 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8baa9d1beffeb878507abf5f20f2fe5305eb1d13 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc1746d7e748975339bc157f5efc1f54bc43cc21179aa2f10d6b1def954aabec +size 199058925 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e6502c9ef81b4d80e88ff9fa55fb38eccec3429 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4064c0044b28181b2d35fa72fb3619f46dfadd2a99dfa8d33a25d13bcb8c5662 +size 199058925 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6e4ef5a0fa91e41c86dea05126b16a400b09738 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b691a4540dd49eaa1a87db3a6c4e03e9e938cae5ccb60b19c3ea1e12205b236e +size 199058605 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfe827bcafc708c4f471a18d25df145c047a9168 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c36ee467ecdc1ba639a2c184772cf3da3b7244f4acb35782d65cca951adab18 +size 199058605 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f8f23c0192d5126296a3aa6927074832971c1db --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8604a234555a69c901640a420c5b9261b2e3ff6d3a46145aac8391d7f6bd6c5b +size 199058605 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ca852f7efe5d572e11dadbb276eaf230c83b404 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:474a83a2c252663ebb24788edbd03768ff43f008c0f22bc87487a8a68aa31059 +size 199058605 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35c63e028cacea9f879ef73bd0d563ab625b2e07 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:133309fea5cf43f74f9e4bb1b1245ec7322a95a8625ad226b8147b3ac95bc4d1 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69dbcddad75fd85752029ebff0e5fe998bba549f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:f8f17951ce823604faea6f7e1fc44d6de21561099a140b905515f49980b7026d +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c54db23b715d3b0da8a681e9613d5dd0ab57b85c --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:421394f2f39c62ca236dc6d91dba4ba4908cb563155b7748fa253d4b5a777efa +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f00e34b9696de9e089bf026ddcb35363ce5e7aac --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d50d296061f3ce6a4b6b4f7d43f0e7e1cb6219d3719d9d4d9fe499a83bdcf9a1 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7c92df09679221504bb5811ec73d21b8a0ca8a3 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4966482fe43c09d9f2c71d788f76f3958c53e3b1b2580d433fc2ec77ac1c745 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..845da3b64433912bf7e79911f5f2ece614a3240e --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:789b514f36fbb68ade82491df3fa3c71c20dd71114ba358da75d16b8818eb21f +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f60a5e6c869c6540004887bfed1c3efae2b9db1f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ff78b787d879e5c3697351b05a0e5036a7cf722388d84d26dfcd32460e3155d +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3b4653509f5b0084fdde715fb75aeed93adc9cd --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbe145846ee853c2680de2292bab01f6ae99542258d63c56d039c48afe2af519 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..4810d8b33a7a360e475f468914e42e7c669bc4d1 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd63435a260c5f84c42679bf456e20830536c9151d877f001874040c668c9e04 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..707b96f5e93ce52399939e3a51c0f042c86ba600 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a5116a7002d26290defee829a98a5a3e63ce39c0ab08bf2dd27cfec210fcd74 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72809175c860711ad503964ef44d9c4288ad29ef --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cda0b15f09afb4272eace7ac09942a2dd6bea54c2874b5333b2509073ef3c35 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db3a698e76a5aab0921bc1a3dbbeb25cfb6e9a7f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f29c880485daaa1eb72e03a49fe92bcc1c7344bec09388f0edc146126cd9f8dd +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb44f0676cbb77b43595dd51053a5148777dcb1c --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:624900480eea2852815f96188a16423dfd0158bfa8f2d265b3e52baeb170bab0 +size 199058978 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fdde19490550faa480b55b4ab88e5b53b683c85 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbb5418cc494532403818da9c1b492da0a35be9028c9ff35de75a0e53221b399 +size 199058978 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23243d369f8e76a22065afb419618211809a3e78 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4d1081645dda4b8bd1f74889460cbbeb59d68583e4254b57501c91600fb5e21d +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63a3d8e0dca520544caef5f0e2f52a50a0c39838 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:389e926103a2fa4be16877fb47f0f13c7a41af877da23584c1efc909fa3c97a4 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b495a8a8396c9e969da74adbc4381c6cd94053b --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81a52926af08b7fa14bc340bfe36ffee824a07066b3cac0db36ea74b801e4b63 +size 199058647 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3be8f7a288f5554061fa0fc9c4ee8356f82a264 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626ac609e87b8dfcd813068c4592012ccb7a68bcd11d9712543846b0aec562fd +size 199058647 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..baffe6e64f508173865431f9801f85321e6311d1 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ad8b8a776bf5371f857cfe46a07fb02194f923f913035c416016577150c05cc +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58d0988a0968824aa24f98798df6f62d59cc6bfa --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfef0e43cf66490cf0e51769a6e34bcd14f47f9986e690b26817a6c6b39d3d2a +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3083e2365b1826beff47bc913c6955ec50ef525f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:593ac7f83354ed0bc14baea567b36ac01d20869e1fbe03990304e29477e3a805 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..b3f49a841a2c3811a0bbd9233952a6966810797c --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2da4462d6bd766b5a832ef422b14759493c0088051504362e518936f6a26c490 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..832a55ea65ef3cd19fdc8b8c63d861780621cdf2 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e99aac37abb2d3f536f445ff8830ab6e268de6cb81ff0ee781336d194c44f767 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3080d25b74c44c4c35e34323383c8509a79efb18 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6a9e93b0ec42f12e00b224119f92d4ad4ab0dc45e004a55fa7dcf187e999b3 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..086946d54458c3ed5f259d5cf38cc9c1044d0deb --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe0c9bd76d73668ac5d6e58a90169c8c0062f9acd2e4f73b1f83a5a3be54ab6e +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..547a9482a8f054c5988768e9c6012201ba015bd4 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb8112525dd5348c6ad4b2be5093b2274c068f4fe4930b16f65cfa05cbd1a2d7 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7928645759b743b134a78dc63a396c92304d7d2 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:665927eb5a486a68661645354fea532aad04ee609b647c7dc42cc67c630b4485 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7867af9158f3608c94607dbf5b6a19095f3b01f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8aebad9d61d470fe33314931adb0e23cff67651ac5c1653ee50073e3f72acf7 +size 
199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00fedbe32e876a6ed137b15c54fb2e94d7bf13e6 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ae525e9b4000580c56da579e890212dd2d803ae820a8197bd7d0429c32cfc7c +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1eced1849092f8269f16ba53f6d08ee3c955576 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c5d5a1a5e18ce10cc9af21ea83bb60e28042164204b1668e3fc4f9a66915145 +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9dd2836981124be47c28b5a70789d3c2a2b89ca --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf402c86eb10b51bd3dcca8641e4876b0ca6ba7e8fcaeb05bd88a99dd750cbd8 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fa5973476c8572426063063ceb1c652401009aa --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c6397cdea22aee1cebf66938b75893b5a3ec31cbc014bbdd2bf4ed9fe2ff7b6 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10e5b334c694f2d9efe47e2991323c718b0f11f2 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd9c9bd1c1cbd56dfa8708163c093491ca760d4bb3d4984434b63b027c74f98a +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86b25cf3bd5585f0c50838de9b49aaade363a433 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b32d35db3d2d8cde47f0da67edb451305d7e65472e9b379a9ed53ac4d21f038 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31530a7c6b36d3e1ddcb4fbc8e1f30de2e4ecea3 --- /dev/null +++ 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8b18ed2fd94ba0cdd82c767bde50cb49f460cb9e6be688c4483ec3a1f17a2a2 +size 199058594 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff7ae2d1e322fd112290f82c7e9060906dcee799 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d8fc28a17fa0505b86f12717f84154da3b918ffc2a20a5a400dc5d52269b154 +size 199058594 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..94ca00553b93612e740b775a4250220df9cd5b21 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1005d0f0380308b1a397812ded48ebf56fac632cf1e86f9486ef1f7ed68513a4 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a09e544f14e1199bf7b50c8b8cb4f77dc49d3bd0 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f6c5c2f321bd82303a052c5f47ac4ac086680459ea721db24728e6422941733 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7cb70765cb594dfa6078c29a83827f4c2a45f7e --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562d61040edd0fdc19e6a1a5a8ff2db258283fa91082edbea2580edf59279e86 +size 199058711 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88b914f58c28c0313f60c990260d7c80580649ff --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a276543c0f0f6774643abbd8ab85f22ab43cb91bd1bc37d1ade7ff4279be8c7f +size 199058711 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..893f5a6efcc29d560a719df3110e5518903fd6fc --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c2c3aa639fdd62e324be64284ec32160b05c9b4a455de146d6476a88bc4ad4 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b55eb09613c0000eefd7f0096e1f50fe7795161 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f0aaa14136abbdeac377ac0b895d669609c8788baa0f04d97f9cb18f76e8aa9 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0756c5eb50bd6b7603c66d0280de56cc1e4b22e --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:172ec1558b523f9ce2c9bcc93757fb22344f38e87cbc0104194184d78928368e +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ce67bd4a9644175c5ddbf2a74fafe512a432cc7 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1108b6427cf5a56db4130a78c803adc8aebbf248e7c068ecad4de6fce1d76fcb +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cebbb58a9311c7c8ff3fce1af05fba615eb14247 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b14f075aea82ab60fd707427f90e9d08af122aa15095cbe313ec0eff34b88122 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..259be1efc23d6f07921b88f3330c003e077a440e --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32f111c8fb6ffd47fe9e6fbee0d61f31cd75e79f4c07b29ec0db0215607e9eea +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee82a1cdda74d2d1608a10cab05bdbe6a8843075 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7993e24182f7f68bac20d8407c313930c4101386af65399e40a0537bd47bc20b +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e75066e32dea71425d80d89e9e4aa3359e3f0d2 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:51e3f5b5a57a3ee45bac443671cbe80e90149bd8f735d66e276f5ac24c73990d +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa36cb7aa7ebe340e2d8c73e61a0d585bc4e660e --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd772005dc41917c04fb92469c8090db936832863ef8b410441ea363e92e9b83 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8db1b0dbf8f86fdfa9d058f973a235b2153c6f99 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a36ab108843e452d735ae8080355962115920e2d3d25463fb069cfa99c3dd19c +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c828f2f2bbf931560b7057bd147d97a9679f14b3 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a26ca94f9e1385fc8bb06b155404e31b64c0f101ac1d8e43412a1051acdca0b3 +size 199058594 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00c7b557bc62e1944b2963e2ebad4924e37aa53b --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e664e5e28de0503704932f19b9993a0424dbe527156d5c0adc74d8ac715180d1 +size 199058594 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b39ed3bc308821fd37d36ee9f0e5e581ff96f9ec --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5bb426d694bd9ee7b266fa5b5fe5eda19f32f5889fd0c07c1707251760bbb1f +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0701299ee92af1b478c0b7cdff227f03bb86ef5f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb13813f4685cd32d6221d52797d7204c23d31a5f5294533f86b648db8de984a +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..f9cdc7176307d4b55ea7dc30233de34a06da7774 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2ece86790540a747d5f9999059e1cfe9ef9f611b15de5cb73c42e75c7f31a46 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e266d11a40aa6ea20542f7d9dc7e665332d0574 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdd070f2774db61a11471b4a0d9b2ac6e0415f7d88b40048ee0749ab8e3c1e95 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53cedc7b56f69e49c1d34aad877c975760ca47c4 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a77794f6c92819769bb8348d15a85c4812e456ccc4e1306b2e7d17eb9f27351 +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ed5e84d2a60293f62c5fc0c2e26f77b00aeb24f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:566e6b828eb70310ef4289c3df764587e8351f7c2114df198cba5ed28c49eb44 +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2731954d0a19da8f2c27f801010c33238e2eccc5 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ebbd6aedc1b7096441a9328eaee4135fbac206a08d7a0b7d129785ac033cea5 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a85a4eb281ecdad537a22174ac118d750974a364 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c862ea2ceb21688a5d71cae7d82965d26124a993941f0e55075cb2fe4ca41a1e +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b2426190e985e4f241c06769cf8a0a7ab8df159 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac28a1fea5591200fa9ca2f514eac53818401c05ecce3d8f335ac6d507873524 
+size 199058647 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e8615a80b83bb4ee0eccf22be982485f7d3923b --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac3035f9237cfb544156c398ca0519d2d93a349f7d032432dc09a27333b1ee26 +size 199058647 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9151302bcf7039084067714957d7728e7beae785 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3542293414bd68f97bbe8e34c927e0771ab4b63b5676efc95f149490ff88e30 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc30a6d125da97b48cdb56605e32a8380f742c33 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ca2f2bf18caab86b176e415bca7854241bdbda818db50126be1250deab7e264 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..887b7bbe5f63f7417b28041be8b57680a7b195e7 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62baf9993da8922d8ccaecc78c80d975d69d23578b3e80a8f08fd43ad31bc7e2 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5a3c953b58fd0b9189ccdcdbae1e74e42032dc7 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19ca20cf1636908b54108ff2509523e18c76d2942968fab063924c8e214020fe +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98f424792781900461e3e753e33761a6a5de7926 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3ad6e0e60df6327b03eafccd8b98d45a054e511607763647260a26a92b4d32e +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..851c640b43b9da0732c0a807fbaed1af1d87adec --- /dev/null +++ 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00a3737d497707c1a323bb03311dc3af0ec60bb254e56981ee898ec903ef670d +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15774d8f6b7839689f8f069972b428fd5d5f5a32 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e95b9f710777a5ef6f33b5f102434a4dd35fe9bfc229f359845a89e3403e9d9d +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..866ae33ce5da4e57cb715f858c2b020155704010 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85dcfead9b883b3e8f4709f2bda635f851d183641add3f2524b1142ed37ee323 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32cb027afcd43279e58d94562113bf2a530ac915 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec1f404a715071f088253ccf4ad92c2a2bc7de8a7b5bbe8a1762047ea33a8ad5 +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..975fa3fa6010322bd42d6e81f84d4f17808fe66c --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:893cd4364777d979a88f6968ebc733b9866ec4d9fe086d33f99d6aa13368dd51 +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c150288956e228876350108b2f832c313c5b19f7 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ea95f54283c34befcc1874ffad67b74208c154dc2b3c16b728a2879dd3e3a9d +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f13440f95f08edb199f4142ea812b4e8b3a44f92 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:888d871712ea0e79cdc0f2b98c3cf4a93ae112dbf1602ae78fcf1742ee07b7bd +size 199058722 diff --git 
a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2c0a5eba1c6d44c7a59b34d6ddc8bde155ac60d --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4848ac39d32728fb11eba9da5290fb9cf79c081df5834a4d36690dbcae76a89f +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd9fd92e3c12fc67e016300b562ae2343b87c47f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:164870e6ffc594dd2546d26d24a736ba13161dfd454e4f168de6f62d009309d0 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b6e8b6abfc023acff7dbf59e404de15f3897782 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a7c45c3ea32f22ce14ef84f2cb1e84aefd4c7febe0566e52978fef8b1e3cac5 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37255607b6b4bb7340a205de292a5abe77890852 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e0b58471aaea747712af1cd7e9902345cabbae9c06d26c336dc199e47a274d2 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebc112a1804042d44d9c357c6122733c938bc715 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2cadf945081b3655fca1a799e52a4af84420b1ba50c5354b9ca1fba9a57ee06 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ff4f7b05a062d47c3455dda1fc78134e0e223d2 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11fa87c174333b21a0c09f2664c766b70b410fe305428adab72e7fb4bf13d49a +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dba4788a15bfc5b55e36d5ecdeca3abf2c4ed40e --- /dev/null +++ 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcbce4c3f78ecc43c2a3207f13768d657ae6f18bda5c57b8606d5c8c92c1593c +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdccd0992fa3a7fdf29bf49738d5f911dd6a4a8e --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14e5e28721cce6a651f415136a8f68131a0c02547c7534146ee660ce1c8ec1c2 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54119e9fea3e810eb6122be31f902433c93f39ab --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b676d528dae8e89c71b4a3672410bea7755ca137fe416593e164e81e703cbb53 +size 199058775 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1650e3a7679d4efe123ca1df972f57ce0f5108e --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2418990174ce61db1ef6568b7e16811ec47dfd94ac8b405040f4cb93a45380f +size 199058775 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3d97225c7562c4108eab638e8b0f7b927fe3369 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e14cd49c5c56f57d5ca1fd09c9a91525d4e27d5d8d98ca7d3affcf8092f3218c +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..181e2c41db2266daa929e7d4455827a84381f5e8 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64927223dc93353d12013373aa5a304473c2d03c661e2cb80cecf80a11fb0851 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd30ad691c24de89272baf08f16372fe67244309 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95ec95305130a69ea82a78abf5196b3f9b75fb65f08624712194b952464ed7fb +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee9aedf0491e8f50c97253c8c5da7dc7066a7cd9 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:844e95d373b042e5976d654ee3dd5976cab697376108f8c8a03f3bfd81776c5b +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..245faea08041c39c14a4201bbc28bf7f4e291705 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff5d8fabe49d20b8fe61b8052e1acf74c59da6a9aa620c0041fd96c428cb3b2c +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99083ef79f8cb9cc075c753fb75372607727b86f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:683945d59a89591c78906dc335d0116634f03121b15c1995392860f325d14969 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e44866fdf18afadc6e254999bd76e4738868f2f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07ced41d87d23b240d8b7b4b2b8a29e837bb02db3dd42a45a3b66bc3a7cb8941 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae703fb24716d85068cc9b94e10333709078c5f4 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0072fd216ba29d1c1b9711acee40b82db412e5052e2c104d3f24f552e84c15e6 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b9a699bd3dd40d0cae751e0e22d1c1689d1328e --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ac5614874ed9c7ed6c2ff39c4322a73a9c7ec256556dc4211316d7c87a74d2d +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2838ef3e430d6ad7ccb919af4055d6472ac65afc --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:be17b97785e70bf25fcf961e6d4c369caff04f00772f9ac43088fb75d6ceca21 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d819e1b5273f59fefaf6be78c24baafd99eee37c --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aca6b3b0ba4a33d5898161a543886111d3ad7bf985c5dd88ec32941266568ab +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e982f561ae738af1285725f76dbfbd29dccb5bb --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f06d43b1b4a2eb23c16956128e53d4fb1318ad439857bdc7e3e7afa78af3a21 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcec6cebca85ce56337703280cab458ea4a07f91 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de76beca61cd9c59c25b2dfc069fcfb35b7b2a1117b8f5ab9df3c2c6daa85c11 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0fa376cf4fe9436d91b9177909ef8d6159c8497 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5eb0445c41241f73ca2df1cd632653fc402ab5c299a7b1895361370332e5f00 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b262021da8c428127b0b0f9d93ad2778ad682082 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0ac76aa4f8e6adc3ef461e8e8cd0ac56a40888b746b718cd5eb7de9f848c593 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0436edf56cfbc49fdc62d2f2bd06519f0c2e142f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:034386bf7c5a348b8f4401e276cd81f83cb461e6338473e1ab7938105cc06936 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..a157df07652496974ceb4544eeb8100d575386a3 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6278c0f2d4f0cf8812fc75e55c2111daec41c1a1b0f16e54c488fb8633f77d4f +size 199058914 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6821bb3daa0be0560112656dae02f796ae26e3e8 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93349a8cbbc02a3a7a6b89e9ebcf3428b28a883188ecde1a6d408e725e8b834d +size 199058914 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..364bd08c158c9727f20cf46d6b149cf3b82db1e8 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90bfca4372a40ed9a9a88df070a502f14ae220e56b9fd66d5f1e8b1cfdb5e015 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dc6e7f507c1923df63fefcf59633af2d72b2745 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f04813a54e75ee7984bf3fc40ae54d39f76aaac9491f8adf479fdb3797c496b +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..725f17ff7cd6fa71c0ad4c2137332f504dccdfa6 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90c072dc10ad541d6fc0194645ed9f48ae1a0e39e8722c78027fdbd13a15d854 +size 199058711 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4686487d732c5e154fa0f1ac586d197ee09b7ace --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47d4929251e844f53108837b33aaed50dea93b55079621f3d24c8d95e4f5c682 +size 199058711 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11309751704b57c910accc579aade3654cd402a9 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9f71ba937705da3c1146b4319929a98e46286240d7e6791ca2e36a7db9e8bb7 +size 
199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38e2cf9c095427db3fe3fdda7acf46856768b7a7 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8886646be5d076419930f2da6c6d45c221befb804d48614940c0ecd47a0c735c +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a754e24fb5acbe1eaea5c57331b4c1322dd77c28 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e79ab35cbd9e26a402084c2c340ea5be65d4c931e79df8e04cd17a33c7df43dc +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6c42b759655be88b3a2168df82d56416fc7b102 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:697e224b98bd113cb1b98bd7a611c85caf6aecdd19d837d20a54b25ecbec77a8 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..90bcc418385b189d224933114fa8c08c915054b8 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:554ce1ef80eb3f8eaa8bbc1fc2650e59c111865791cd4bede444febefda60625 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc9acf19e158a944398c051dce54ed3b9ed2b893 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94951087a00d7320dd3add28b00c63f340999da7cd71ed00353007ff01d7b364 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e810c4f7b1d88ea5795bef12a314ee0fa1c702a0 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9862c910bf8f42ea784b22a040f673bf111696f90d6c91e4b59205be6a21d465 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0810de9cd887d91245b0b6ddae2e193a06023e99 --- /dev/null +++ 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeb4a0dac05c1535f260ea0d8fc5660a53437c029bcbc79ec21cc0238f424671 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c20ac4313f15dc21dbfa751558950451f9fe837 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7087593a9ed19def6929b487f3e85bee17920dcb9d189fd3d270d70a0a1e971 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3307230c2ba22b54b018f27cdb3fc630dc54092f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fe95a46a6cc306db6046a5e7e5dce6a14fa710fc22abc8ee7c19ee729bbeb18 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d9f9417841c0785e1c02b07e6ae78698b74163e --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:042094f518b71f285f597f2931b9ef252993cd1f1d4fe622a44ad733f515a5de +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5466aaa180042d0058c69f0df9e3e1c676777e34 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d2144e46be4d7903a177de1e5e494b9001582ae39db30b49b264ee9425e5af8 +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..007c279dcd0fff6905007009eb9592c38482747d --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cbafd621c5306ef06dbf2749a9b9c33f16fa4607c2c2df140c8464acf6d53de +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f37733f4fe949b7cc3526039ed1e2fc626fd28eb --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8570485da46a3ae3291c8b8f78d26e1e6f26267d5826399478393efcba884702 +size 199058722 diff --git 
a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3b91f760761b2f97172b5d30c7cb2c3ac2484e0 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24a903a2b83afdb268d766171ac69202fb3d3852b04e816ef9325ad3942015f7 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2e615675d278ff6bc177f6340b0f82a2e8f3ed9 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63a5f973f7c5701a765364645e905d391a345afcf667ac8471d5c1c69301b4dc +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..049adb090ec1ad304be55db9a4517d0c2fc6b9fd --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cec72b3634594b962107cb631e655f384440bcec02b113e5dea44882bb40303 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8273fdb5426b31390adbfceaf18f8bf4957958f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a653226fe3730e729e9b42b0ca9f8244ec53859cffa9e80cf81b776185fb5c8 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81b71737745c70a5fe14f5c94a73f2c93e18a7a1 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38b1d5394ef4f0b21e22ab8ec2138720b4725d98ad2dab9b1d5db40df4699c10 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ccb75be06ea9cc32b3632062f8d06a95dc3a1db0 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2c1fbce3d47a9bc1cc400dbf40e2d4f1af2ba66dc727a15142183e25db50863 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee493c4b53f5f72d2869023d9c5a0c1e10e74133 --- /dev/null +++ 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a353bcdbddd89588b4cbe9efc9aff9824679c3a204697b531be437191d775a0 +size 199058647 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a617f23d69b19c76e7ddf8e844053460ca98426a --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b82944b120f99b588327f175d51c04ed2ec757b9cdab91c808274895d553f3 +size 199058647 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b22bc93a3ed0b9f9e2eb8a75bc47ed55502813f3 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8be92d68b06f273f48ddb68fa6772844b4a839e748c0151b4b3079996b1bf656 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ced16b405d1f9f4e6732ced083a061e41ee272b8 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40f239aa8604e6d6b862f20ce77aa02941cb7ca7c79e3da76a25a7991382ce08 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..726303221c84f2c5fb53bef650301f05533bc8c7 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f205b0285dbc580badc7c6e476fccc2ae097d71da078cb110fe7da766a81595a +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8675e383663bdd2cbe884559d996165ac6b6845a --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3316f8fdeab9a5ddcc088c4e80c10eed659f13bd52a08874b7e475b2107d779 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4f17e83a8dc8ae60d244c25d364ab70b87021ff --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83410883aa48a24c2ce31bde0e431a7d591b9d1fe1667f2cd640cd3ba2c5f827 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22d44e69ce76c44e413ccc745466966df90578b0 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dcddccd9a4590a56025b7de1fc1b5d8e980b5d0b8c1c4e170ca1671b69bacb1 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4960e1b3d9c8abc1da3a99fd521b1f2e44189be5 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:299d43fe8bd69bb49d26cc14151f495a37b7caa341d41e4d1b1b8e9ce3020ebd +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72c8170a14abe7bd0076a83525f233b16f85027d --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:858f6205ffe539977e3d33e3da198b644c1f5aefa5ac259685e0455dd31d22ba +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe8b3c4dc5fc0bbca158d6c899159ecb18d75185 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:393fd9b0cd56be64a4962c79f410f95f91b95ccc26cda9997f781c9e6fc37efe +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f5401d62d41fc1da2a6b38aeabe7801bf613b3c --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a75b78a5078128898349cbaf7c8c9538ec4b409874fce5ec1620a52173aed758 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6afface082d9dd96453575150dc4ecb8715d286 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:653c6cda427f10d0bbfe96099a6ed21beba3e69f1e8841a761a8f15fd155a781 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b2b6081df2b073269c73ff901a157695d0c3d90 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:be9e6c03b273b7dc1e4fb8c7408ef5a885c57bb1b89924f416850b8dbde95b36 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f1974d1422c474acdc1ea6565d0761d74ef40e0 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de4a232a8dd29efd3597667974335a0d6de36d68f13aa41705c6e4192ea7e8a8 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34779f0a3b7ea379baa71ceba79e36cd3b735061 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0661fa85710ab2c6931908e516fc154b17b732708cfea07473dd8638a0371cca +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7ea51c5ad1f9ab7f3691f7bb9c5bc31617a832a --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55aab15f105073a38ddba1f01f549fb5d5f8aae6fe22f75f9b20bffe1e2369a1 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f20961a0dc118cbd069b45f5faa14c8a3d21c23 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0b8b72d4f38601e4d096705af0e93bfe785c4a8483f667829e4e56246f4cfa3 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a05ac5c5df03d90b4831bcc7ca3dbd6b806d80bb --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4719504d3093eefdbcbaa3157dee8f2c20a7fa521e3524431dbd53c33314bfbd +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3fa814ed8e99c25521b851e52df57e50d7f943bd --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7510c7a97b7a66119e67e3ce56e057b1cffe469f7c89092c20f03e959bc34c45 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..21526dbba6f934eaeb7f7bec419b2d50b178ae16 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d6cedd1848419e7287ba1372b104aa1d2a69f32cb3a4fe5079c48d0acb0736 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fee958ce1fb1b694e27230218f0185cb35690290 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0245eb47d364572c8f790a27592cc3579b0f630a75e47beab439541b6e6ab324 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7afc490494814b6616701de45df3a3dfea18e10a --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38da70ea21ac713c154705ea30d1ab6331b5197a28e0829faaf273cb74a81872 +size 199058647 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..289416e3cd450a8db4123f6e9135120d35703b30 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e7313e38c6a9fe0d234986f964e4461c84c2cc60ee9efaad6cf944f2b02dd6 +size 199058647 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..085ed9092fcc47db943a7fc3c193f90d47b169bb --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3122f0261f142a538c25ac91a2f7b6f1a0a272742991639275617ce3aa1c02f6 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68221ee5127cef60a02853039dd23edeedb4b6cf --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7679e0f85c39b5aca9d8e3a0b8affbd4fdd9615b452143be3b4fce29db9ee43 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a152523a40b2251b4c7d7110e641b2bb0eefa344 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c12ec27d695931eb82c42d084f6f38e9f0bdc013954ebc560301417a48d248a5 +size 
199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8d74e9085854f50eef5b28f5b8624904de1e8e1 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d662adab0f284e66fde27774eaf7eb271f096d9e62adf9b52cb088573f0da1a +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..494144f31b801ca1be88f3940df66cb920cefe3a --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae40046022711bd19b718bb1c02aee2a8cb51256588c6d8a6174d5c8c4896d74 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3383696c3dd6c3bbbfdea763f71be754f3c449c8 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd4908d68bc30f88a8a6cee7200f39a224d79c523a8ddf4f1a0e5a0cbfc0b55f +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b1bc979eb8e111eb523a758ed12b5668a1b7dd1 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7668415c035796d43171e2398abdca250d1ab04528b2d93a9643563f7c38a2b0 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4876e4cb5da4a0bdf9759a3ef97d2cd1a28ca479 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a6e134b98df3c9f82830250d1f631cded0ab53da51461a00f8ad71d23827b84 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..172d096b39580c4d9d287211dfdde0ccc963b585 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7d3751cbd9af19b3511310a3a5221a6fb116c1391df8eb9e28b0f2e30810713 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e590183cb0cf63a842960d76adf60e6f7722132 --- /dev/null +++ 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57647489e72a572e6dd0b411bd702b621be17f2e9c9746c04918089d8972c2ec +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c59b0d271ee4afae664e89939147767c51e52feb --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78363007449de3a09ffec453dcc3e6054cef5506bb41e48a484b4c1573740254 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b4f295d1e57d45b5c94cb062e26c821d5475998 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5297aff022a7d2b18396854be0ff75d9b3ed21c7cb05367cbd05c28c68f03535 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1cd0c5aeef2c61acecd89edd5c64a5c1b3848a86 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:159aaec07b0700fff279477834e273e4caa6024d9cc7a5a811e05c9342662124 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba43b9efe42b36bcedb9fc769bb704140f12e679 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd860efceaef4a8b595a15d82c15f0f4b09e3810bac2cd74fd75bc9191a7ca2a +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0158743c297a3e770f224ec4d16a79bd0852e8e --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30b094b6ea7854a35130169e2cfdf99f22ee0e2dbb05929e8193752f8d9a264b +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b06bccfd8e7e2043412d97af6fa2005cdbfedc71 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac71a25614eb873241c2b787501642b2a8731576847a259e55d9027d7c4c21b3 +size 199058722 diff --git 
a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e369734e39b28419e36aefbd4a343a79103bcef2 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccd2aabd7ad83ef540a1e00880a0729165f94b60646ed97ecb429745cdf8ab9a +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..942788fc6f1db0ce44374389afe54e1cbda896e8 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df62058638fce931fa1474f8e420da8dc9eee564a73b9268975f001cb6a2386c +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7935bcdbdbe31070ca8cb7dbb151d4981a51b885 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b8ca2645d3a21e802ed5f2b12f092bd387313f6ad8b240773f0f5cacd234f6c +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a34142988f0adaaada99c046ff3642f3fd62d241 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:433e17b7e84d09b82e8afbbba8072d3bb69beb256d8a6aaf6ec00905ec3b9119 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16c461a29858ca2815bf6cfedf4e5ade0ee70585 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc4c5be64f8276d802da786428ecab952e5264bff30ef624b4fe5f5ba711156d +size 199058647 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65ab6d9527908195bf2ddca70bdd055403c75de7 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:238f4c4788736f13a9df3915a15c8a3a9802492bf5ed65cbb4085146c82b0ba7 +size 199058647 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63470133bf16c76b7d9b4b91f6d467199b593f08 --- /dev/null +++ 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4fafb4141cc21b263bd4b8e3b3959c5303801aae63de6524f14a1fa9a6fb670 +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e0e04879c2db76cef89110cda7a61ab717d1472 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:904932098539e50644a69907427b6768564b090fc82ecc639531398a0cf8eea5 +size 199058850 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2014d928ac02b1520a66e9a53d24bdcae4703dbd --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:954e34a11e185654fd73bbb792946afd1882a93c75e7ec509a2ff7080e9e6c4f +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a9d919691a20c669f1ad543ba853c3fd69a0f36 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7033fb9d8bc6418c0ff0dcfa7f91d98ea22d6d3e1d0d9e80fe874de6968512e4 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..820ea7cb97531426143e24fc208b6ac968082031 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0e0160f3bd07365cc837d1ec1d36a96c72707e5db3b16129da96992c2b98c88 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65a06e3c1d6891e4675e4cc8ad018a5042254e81 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ae381d4dcb93faa9065dce73656d896c1d4d61810f0a6ca63f701213171165e +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2dc150b782a9287fbf0cec27b7c8a8d9c2702b3 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73fd860c7816dbf1d07ad41b3b8aee2c6561109ddcb8d883c8be3faba1d93c55 +size 199058786 diff --git 
a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de28cebd66f7dc409c08db3f5a29f936e9745bde --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbe66132f645f78e48f9346609c9a89ef5b7636244f8ea0afe87a35f4d07dc60 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4ca261869bf46b013ca810989a6736b8da2a9e6 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:242c6c951f5f0ea9d0ea328ef4e564e19475b618ceaee22b637e5481569e1f55 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..667111b5235b85fc33cf7f5738d1d3f35371aa93 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77a75ea12fac21b1e8e8b7f412d86c4be7676752a860fc51c05cf1fdf4555706 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39adb08b05e7f2462b42f34376989b7479b35a4f --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:351c4d50c6cd3cdd671dc4c14c6cab51690270e606866ec89e6b52c76806621d +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5eff988a44dfac712bf17606f8f7f513d8799de1 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9eb612b6c802553aea59b5b351a3aabdd983a590f49b81272a1c24e26586edd +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..95e4f67ac0dfc93b776b52d79bdecb52354e65a4 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebc85403b959b0f10c9dda68fe444739cbb6a5c8c9892226b4cd2f7f722d1db7 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea59a6466635a92463e1a6e131674e8349ede35f --- /dev/null +++ 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6d579f6f5173b5fd6cbbcc1882a255d39e1f74939b44bc04cda6c0703147946 +size 199058722 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5a83e32396fcf0ad37c8432d08e7f266c6fa06a --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91c86a12d9f836a235858d485df26e2ce9cc0c1cc1799384282261a8f5081baf +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9592ac45d540e840fd1fae76098b8b7d2848f299 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18f474cbe832500af82902367ce5b6d561c51dc9389e53e3485598d9f6b09f23 +size 199058786 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..948d97c490894f84c33ffdc3e835ebb4b9779425 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93d23203876338335c9244b3f84d1e0fab28676ebc080a18b32126131514ddfe +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38b32e2f7c60027a536220e0d7cf9158fc7eee7c --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5097bd4260c6fc22b3826d1950b1f74c00b4a21aa7818bae1f9e1b435d3ec1ee +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d9cc2bc044d13306c3c4556448e1bbf4439805a --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b95dad8fdd2d8c8ab60026aca258d61fcb38c10cf48c8f04d1388dc7237e3afb +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8be188c9815fe55e53cd174809bf50917c906c88 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e8e67c58e111f6409390f18a17964138f0184fa4312c33fe1a99f7b06e3e602 +size 199058658 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 
b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba7b83e412a9ed84653d41783f04cdabc3176aab --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:532abc41e086794966f330c7a12cee0cb8b2e6add6fe651d64c2d8bb92405e8c +size 199058839 diff --git a/4b284b12boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03cc20ad1b6e59b01513d3fe94929351a1c2f3b9 --- /dev/null +++ b/4b284b12boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3022b2647f155e7f43b9a22f7318e4b799a89da3e1fd01c170d0c1ac195519c0 +size 199058839 diff --git a/4b284b12boscar/global_step80108/layer_01-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb23a055283ca866836875d0895b6b8165f6f230 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16ee43df1d6a64e3743a2c276cf29f9a7daf29ad19e3032e2a8d48af45788336 +size 167511299 diff --git a/4b284b12boscar/global_step80108/layer_01-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_01-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b867cd3132602e8e6dcf194a22ba3dbd12745ea8 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aaaf47b8cb8e149a62e04c125fdc718f0d164c98b97096eef822861132518dc +size 167511299 diff --git a/4b284b12boscar/global_step80108/layer_03-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1bec65e04674f5f3ad59afafde4a41e7bf3fd3f --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fb5e2d94d33bebda8c60254bc65f2409d538a1eaf7a58023c6f008c8c8626dd +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_03-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54830bc0251c7588cd351c822843080169b934af --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d482ae888026cba2115f605077c559b016c6aa1ea88434396beaf5611a716ffc +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_04-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..953b372b0d7562461ad42b724542393ad8068b5b --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:630c063120708c0513976b6606c15d759d7ccc7e8062f988d111d050d4c68b79 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_04-model_01-model_states.pt 
b/4b284b12boscar/global_step80108/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..169bac32af25110a7f7a32caf7e31300127b0d47 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60556453762c5ec252732369e47f080e2e6429b7fe4e0bbc230ea8049544b8c6 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_05-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01355898ee451d1b54385063a216645f4b90345b --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98827c6a34e42279b5a24bd704bac60c8cd715c62a6db3d36ba12bd821f15525 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_05-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a7de8755fb735df35a2647febd207b793e51a6e --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddf71061dc0ba5793db4bbec2efbba5784827ae83bd84f2a63c367c0d7438d68 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_06-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24216f387c7f2f00f55031410d3badc2ab2c1cff --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f3ae4ded884747b121db1d5e254c4d58e1c8bf677d6ae65bd81495f10288b03 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_06-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1d8a9b6e9f04478ce14ad008aa9c815d50d3b3e --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eee9ae74cd4cbf92ed77c2f4ceff9fe6648b9e3b6a166cce704e3280a4dbefa +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_07-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a9cc07a26a12f7c1a5eded0602e494ba589c7a9 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b996f250196c46a93846517f3a7a2d47a8998e7de091060a88d7fe4856c06b2b +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_07-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_07-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22a814590e7a4d3542cf60483b88a5bded55968f --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06f847962536e81bd41cafba61201d7063223968e6d3bddbf8d84472a4f11802 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_08-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_08-model_00-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..63c928e9a0e6383a8f279db52dc9054e394b9c36 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b57f0b9555e2031958c2965b1b16784c8dba7eaa081e424dbd9bc07690422ec4 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_08-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_08-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64b72d3d7b82dedaf7cec2bf6a1f6c635ae5c508 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3de025befe2ed3329a91f42e90804514a3f8c0d0bf749ffe73ea19648bacecd7 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_09-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73ff8641d419d09190941a2619d2a8da44895d77 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dac597e8c561cfdda2cbab72a8f66ab7ee00b86c7995d5592eefecef3466e457 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_09-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0629a6673890fbe90941dd5e0a233fe3f772d78 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58a423946cee3048bfe7ef5387e5149f29b6453e32c8ab360b755a8e2520f7aa +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_10-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cc90ce8f0a9a5691f65e513b50abae29d5afa19 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:138211b18a6fd9f1620fc1a2fc1e290a82dd7e163c3678d84c9a8248173dc979 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_10-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7822ec9c074408bb7d9e5903c978a94573819a62 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c02b5142f0648d50dfa9c74c8ef53c95a2c0d11d81b6bc2713f6317c3ec11bd +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_11-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2e927cfb26fe4afba4a06ae72f0169b887f1b0f --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e71287afa7554bb9fc2d7f90198b00d6bcd8d3306db3a6a192759186537fb240 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_11-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_11-model_01-model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..0c04b2ebe90a16de9c8177963f896442504c3ffe --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48e3725c053e853c532c92f90eb18a2c8c7a01dd557ef5310514e7bef74d4c82 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_12-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c95ace7b446f30f054dd8213ea6ac1ade7d9f54b --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9ef3bfbf4191bf22dcbb6e92dcf25179b83812d073d419825df48ec83164afb +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_12-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_12-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..106f85831557b8ea49409cc0833fa2551bf7e7a2 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6698dd5a6c6eb4739f429e5f97e1eb58733271ee42cbbb632da67e7cba3c545 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_13-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a22ee9e0cbc5f64d32498f89468bd55f6229f81e --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0efde252d8e6344d9484bf7a0ceebb194b9aff95a9b0332f3c40ce94e1f31509 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_13-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_13-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df0382211f05a4108d98ada0e5c7b0931520c3b9 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73b6b3a231463d14fadd854c0915928517c292cc44dfee80cf983ec6b76fce69 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_14-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9aa7b30c8e40e3227f44dd73aa7ab6f26726583b --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b49f76ae6c01f310a41d6198612fab9766c20a1bc189beddf80bac66cf7b3e0a +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_14-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_14-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44c512ecf5ee11fa715d473e3c2aca3da51276b4 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0d93fea1daed215398604da44b25eafbfc0bec3d1278005ccd4038f1f5a27a9 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_15-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f2fd4d40c604618ce50c48ca6b8f94d6bd4f7bd 
--- /dev/null +++ b/4b284b12boscar/global_step80108/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed6b69947fbb593ad810552ed2e7789630f379d0068e4e1839e5f780cf05ce28 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_15-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f03791f0d82de4f039ac3d53138c4c443c55fb4e --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:162ff97276ce7216f24312e50c7112ea9c7de34a413d5386a1fff159f4f2a4be +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_16-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13dbb6a2f823ae9d768bd036d729db7e37453df7 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:556d9249df75b6a21f3a4d6423126559f4111dc16b033b36346fa284dcb3e2f2 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_16-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb9f46ac36ae83632ebfa706759fedec26d1e619 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:351b84bc271072f884c74684692cac3e037cf5b7a04ff677b7e590541a3286ed +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_17-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bc84e8a878c43d875f7842b02abbe085225c463 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f798b677824975ee5272a74a3889506a831b6e7003e6cc2d8e80a21440fdc72a +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_17-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cabf679b28a711c3f9dec0115875f79ce36ee516 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d31ecc8e07452544c1e352d34ef83f2debdf36895e902003f66c9c92d4c279d +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_18-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5952588836d2dec7b791c1b3c6aa1727767c770a --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f71c1e752afc886a8c223696d9080f359e2c96b063638fc29784acd672a57eb1 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_18-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_18-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0dcbb91bb9118ad39215549794ad4d19f3bb8587 --- /dev/null +++ 
b/4b284b12boscar/global_step80108/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fce160df6dc10f39383fe82b4b00d2813c829b25514f558f03a2bccc37a58c83 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_19-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..93a890ec0fd7a197eb0bcd5248eb1030894437a2 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f24e304fbc23ac29e985a769677b0d3ddd432191339a8990fbf61d957d085645 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_19-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5594d5bdebabf12200fb6d9426064c3cc3a80edf --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faafcdbde83abff80de3d0fad363905be0820cccb0ea8d27644ad0a52b0525ef +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_20-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c2f89a5344c54f599d0fc46d0cfa6d3398fef82 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80963466a45e67b07436c4a528440fd48aa78269d38d0e7776b47254537c8d1f +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_20-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1dfe73ca1612720d384adafdfee2796cd4c7d54f --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1f8b8cf355dfa59708a3768ec188a085df4dccc7020e17d9c11a90516a66302 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_21-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45803008aa6b12dc9ea80da04f30c89b8f9ecd5b --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a56a4b10ffec445530eb4c2762226ca455edfdccdabc1d6aa798f1efd39c8cf3 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_21-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..597ca97e6ce52337872149b1db15481037e8e35b --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af7e8da39432e4a0f09b923e7ba4a98710b3651e625527fe2b9db370a7c7fd6a +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_22-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81265eed64e6c615993eb3b5605a6c11ce6c099d --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_22-model_00-model_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0589abb971c14ac01fa798dca5a34d95ca62c231b2ad3f840b23fa3c83bedfb3 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_22-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ace910eb34ab365cb67d7d1e72a4033df2f51ad --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_22-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7100ea2b062370b4140e370bf741d756f0262ea10e3902ee1f669826b2145b0 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_23-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d336a6417e05c593c12db9cbf9e231a8e7abbd4c --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3878de8560c28ca282c143322bd21ed9f43738fc8691e1dc5037e15692470561 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_23-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1f626974072a7b746734d5ec336f203c7e3b719 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7ad3aaa1172e804bf54ca5ada4acee0790b7dfa6b0223fe723399f089d1ee4 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_24-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b9dd0b22518e6e2291a1b74179456219c0e3605 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_24-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ade330c93a006591db323d097877ff88dbb4dd4164f4a1859cc6f791629a4ea3 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_24-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..95b8b0d1400de376e5a6b94e32dafe61f1c1408c --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:617f4c71ec0a184f379b9c17eb63a0a3aa2ba1704f103866dfa8d13eae334857 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_25-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6396531c0d013efb6735a58003d44cd29f63d94 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0955013da77834a10717b5beb4b4ef3af499d7bf04bb1b9d38ac685dfaeb45b +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_25-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fcd2ee949b2489169638e53690eadaf3083a21f --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:710fb74eb770b0e864a81e1c4d6a4478a6385468022339a84edbf3374dc4bed6 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_26-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7303e858077b2e9274f2210fd76876a2e5b59219 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d309f3adea8cf2e8cc144337417a690a84d8de90348131dda9a8d65a9911a8dc +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_26-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_26-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3281365bd8897a88f71891176136b4b5c573a946 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_26-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9be8467294bb8ce28a3f79292e089e0680e47cfe1032e8a97ca839d036be95a5 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_27-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9dc1d635a63cbd7b4e3e86b2b1058fbdef7e9a24 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a156e08bef033b25a88df679ebc2a1a3473d392779e01bdb6270ee22c335006 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_27-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..126cc7502d0aea2caaf7817730289f9697bfe1bd --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2344a557f2e6b23901d89f31269a3ce9aa1ab778b3de283d0dfcda2700bca7c2 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_28-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_28-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..758e82fb44197bb0ad77cdbc95b5ad9fcbd662cc --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37b72c1ebb1ade5fa83f65683ec877e204eeb10575fd36688c1c1d0d2c466a34 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_28-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20ae30a94c9e5b70078714ba81655f66993bc42c --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58cd3a7b99b3516658dcab9e0972a6fb7eba63bf895298227a711f89f0acbad4 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_29-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_29-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9dbb5e3cd76c5d4ed20ab09fd4821017362e69b --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c9309257b7d6fd6ec45447a4d5caafd19807063838e6fe9413b323efcf3f1d7 +size 
113308931 diff --git a/4b284b12boscar/global_step80108/layer_29-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15531b2d8fa5d776c4403d324b8966f218d431e8 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bb2cd0507212abe0d53206551016ea64c45be5b3ee5293265956d65b7e4682b +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_30-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5873655fb556ab6843ea1726c1e9a97c32cd526a --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50535c3c468b67c315d38b3d7aca19754b3e4dd2a0c680108734dc751c90f389 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_30-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d71440b78a9c31f3dfb26aec444d0b3b37bc49b8 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ff8aaf778cd01def920ed91b5d5a8ec0b67d4374b52c678bf37e6488760e3a9 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_31-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..17273a7c70e70f1967e8ab3ca926a9d64cb91d9b --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:135b0b789e13565d7b01a201e63303a3fceb01fa3ca87205a7c6a3631e7b5050 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_31-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_31-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5dac1c4ad1699739707ba61e1b48a8b9c5459743 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ce7afdf48ee11d858f6e6fdc40bd3a75aff4b5ec7f327765256d484033af41f +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_32-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34b2c6ed5830a10f8d40c11f01308d3cb6eab55b --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ccfea3fd9f572be7304d77133f9126307463a5171db8d8955b6444ae993795d +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_32-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09e41853cc630d95b57eed8649360e89091e599e --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c9e66046e628e4bee7aaba9b2bdcd6480f49e8083ab4eddb6ecf4a5a707ab16 +size 113308931 diff --git 
a/4b284b12boscar/global_step80108/layer_33-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_33-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1318f470d4629e8a1a28ba4a4534ef14b9c27bcf --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:792c4a5b443e6895d8fe8326bc396f5a40440e4639d8d15b81532d4187606180 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_33-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2560782e2325eebbd22094ef85bccda43f01c88d --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6416e27fed5b1ae3a513879fc9a10d0f5674cbf8500c374779a0d507e7a23ed5 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_34-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bc2217c28afb08d08f1f86b909cbcdd14398faa --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ed481a36d4ce1d18681abb9efb055629ff601d15df73a9d18ad30d87486c10a +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_34-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3aee79b81e7e45f9601c3e265bdc9b71dcf0922a --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99693008275f4ce47157f6538545a406338e3f5356cc7aaced502f6db87798f2 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_35-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..007bbf4c2683983a610755d479edd92c91354331 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0638ff90df8055f95f8059bda6e60bfd9cd49f5408be14a377a31d9b0e33a461 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_35-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44be0e6feb838419ea969de8a08bf494e4bbe67d --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b59ca0d7c4ecc12cad623a4f38c099590a498f23da554b4d16499e9b3616d0b +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_36-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_36-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b954af1323dfc5a0ac8253044a51aad54cf27cfe --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a30b1d84c1f613cfce2cde6e5191d51246c25cb38026ba707df653f5b309f53 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_36-model_01-model_states.pt 
b/4b284b12boscar/global_step80108/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6ae6f3945d00a7ca6e4c5954eacc182f96cf3a2 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ad11dd6b327037f7172ce3a238c4fd457f7b703eb4e8482d51dd64b3160fd73 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_37-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..29a2ae3db2fdd9ac53591e170a489508f2210f5b --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4a41411e39b1ef4dbc58f85f842bf80480d0dd57f8f9e40f333269ecc583f64 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_37-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0df69518ebd27d349b440c970f31bb1908b55e1 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a714a409362403f86e25efcfb3362b71ba84dfd387da3d72c47ff7662166dbc8 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_38-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e62ea33845946651366ea6715b9ac27488f16dc --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c33417222edfb090fcc12e223531e6f41ed63446ee475bc1f8ba689a9ab75d5f +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_38-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8a0d53ac5968c399c6e660e3f737035711c32ab --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0efd9471b6449cd6fa759096f3b01615d9d70a1b71cf630f6175339b35ac8558 +size 113308931 diff --git a/4b284b12boscar/global_step80108/layer_40-model_00-model_states.pt b/4b284b12boscar/global_step80108/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec01145813c4fcd4a7401243cd586e9a6c659b9b --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5721374cff6b07909282fe3c186f7b3ba47d9217966dcaa531f96f614f9c91d +size 13507 diff --git a/4b284b12boscar/global_step80108/layer_40-model_01-model_states.pt b/4b284b12boscar/global_step80108/layer_40-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8c93b681594ff7278550d13711400b968c91b36 --- /dev/null +++ b/4b284b12boscar/global_step80108/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb8b0b0c1f6f642efe61d3dfcac8b63fe6bab94df76f67196ed189a69052070f +size 13507 diff --git a/4b284b12boscar/global_step80108/mp_rank_00_model_states.pt b/4b284b12boscar/global_step80108/mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..73061b35360accacc76ae37f516b7d205a500960 --- /dev/null +++ b/4b284b12boscar/global_step80108/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f18c0a9c53a2e8b3d0d7ff5d88e4a2837dfc0d91789f9870704a8923c4f98a5 +size 51635 diff --git a/4b284b12boscar/global_step80108/mp_rank_01_model_states.pt b/4b284b12boscar/global_step80108/mp_rank_01_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ccc4fa0f3247bdd275d021f3b66ed0c91cdb4bd --- /dev/null +++ b/4b284b12boscar/global_step80108/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec6780d5fe37ac809d6664d8d7d5e15c25b6591ba534053c1b068bcb0f68c743 +size 51635 diff --git a/4b284b12boscar/transformers/config.json b/4b284b12boscar/transformers/config.json new file mode 100644 index 0000000000000000000000000000000000000000..df814c3a71ee49044410636b5eeb4f43611690ba --- /dev/null +++ b/4b284b12boscar/transformers/config.json @@ -0,0 +1 @@ +{"vocab_size": 50432, "n_positions": 2048, "n_embd": 3072, "n_layer": 36, "n_head": 24, "n_inner": 12288, "activation_function": "gelu", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "chunk_size_feed_forward": 0, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": ["GPT2LMHeadModel"], "finetuning_task": null, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": null, "problem_type": null, "_name_or_path": "", "transformers_version": "4.25.0.dev0", "n_ctx": 1024, "gradient_checkpointing": false, "model_type": "gpt2"} \ No newline at end of file diff --git a/4b284b12boscar/transformers/pytorch_model.bin b/4b284b12boscar/transformers/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..243e6312b30215b14aba6e3ab6666e92260b8c0c --- /dev/null +++ b/4b284b12boscar/transformers/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:578535c72b38a763523231ca2a8ed52612e8836b58227a96ac2d537a8db4c27e +size 
8781203669 diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..20093dac2711d7e19d03c85debc53d71cfe2ec45 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.30064593181269955, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.026879704157341512}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.06703651854735132, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024391085403758696}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.26740772484207054, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004912155390010422}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.09060657840339362, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001978890806658563}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.028179323090803912, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013431147572720058}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.11491018942638491, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0030470199083947793}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.039330605607392446, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011751573991520296}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06406482680474013, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023017838075157946}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.257657677037235, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004746012740112394}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.08688274387361468, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018578198715780445}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0634475056304135, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002344312215241603}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.24940060733167396, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004590470603510631}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.0854912841791626, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018955102722507356}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..53a9c183f5c79356bce8cc96dabcd92e533e53a7 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5448014860907993, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02354902424699172}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.1388827638743681, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004513090850395629}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.30199833689345784, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004833949344945251}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.15412558821549818, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0035554299152960674}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.06750783374322028, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002988686287368457}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.14947512582685754, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0033695091348626585}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.07545981652785559, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002357043487541018}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.12450746932788907, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003982256290142074}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2819898497969715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004422334130125733}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1394146722460062, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002993354560745065}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.12750187523825526, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004097239372173061}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.28488318300316834, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004471716742740801}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1421792554091385, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0031066004309073594}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4f731bf3668d60da3832f13e28d80af969f61e68 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.7494822896647901, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.045680162494902386}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.16651047001316224, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00503164302116687}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.34879486947967936, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00485184387176236}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.18686493440413654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004140752549886329}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.08795085013666165, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003446853896796009}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1803167154721538, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037373578885484243}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.09627247599444268, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0028826866977524927}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.14757163053942288, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004365281542114343}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.32518786802891597, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004501197296979544}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.16801781294867849, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0035458321298973234}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.15178195561235863, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004526255293354419}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3286569936174265, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004546078233004703}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.17158619573308823, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036701761982919448}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c43cdadd646243225aad0e2254d5d784a34a9921 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.8754863248245969, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04360822451403643}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.177665566463972, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005213159352117334}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.36388817345982977, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004942659870867192}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.19673949819066772, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004214599980941433}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.09644734799007487, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003610059263232589}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1890195299674566, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037165922348924205}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.10183935852735193, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0028397205254593197}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.15715897881351443, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00455057510376714}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3364361502529422, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004479392424459559}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.17564046576780104, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00357350787655052}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.16277909758999493, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004769154955939382}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3412001797439044, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004532856553347463}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.18055401062696452, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00373462611859697}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ffd6644d6e50e0f59979415ea32f3036cd563832 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.9609553724467281, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06219622144359429}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.18155711637430752, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005153207013059504}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.37582358230783497, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004868478990903738}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.2035685591724469, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00420720523051007}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.09862370157443069, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003550556891817434}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.19877038562348062, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037326666334346983}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.10691762381917162, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002903183299069371}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.1583476022875235, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00437564418742707}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3452891933106154, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004375446165777104}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1801505100456093, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003508323835548118}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.16428760554962601, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004585243637248548}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3517722828354071, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004454120443642119}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1856568180747029, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003673260642261377}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cfea76b1ebf6e9de1a16859701e8a9b7a08a2f6c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 1.0578470733972574, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06748885712032304}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.19481471590160654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005470101363136162}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3790270896990674, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0048619650256576654}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.21188453557964756, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004406461443397243}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.10948046927141483, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0038119634607733702}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.20526211579883663, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0038681925059066017}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.11512570203467995, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0030866056829496383}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.17018196029834431, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004683975046675936}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.34939009051908676, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044237864033617015}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.18813765770684918, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003736931314323814}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.17731841714776353, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004926877660252843}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3557595779694418, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004485010413076965}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.19423676766572612, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003908165471883654}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..534c5312bf9a190fea11c95c386663c2f8e4079c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.021230789148664757, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00036930334901350486}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.1458873272470996, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001900441045033603}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.03584736696676297, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005714702584391066}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.0007014836062982539, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 4.9656181708739825e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.004439318110846241, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003216075158994171}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.0011821469183760484, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 8.251246944497151e-05}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.021217584488589816, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0003682938445043042}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.1458356603789047, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018993886668088465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.035826640478609555, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005700890073056944}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.014393803982682208, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00023960077177274328}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.10436119389152143, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001369730349153955}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.024445633867760844, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00037319784494180513}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 0.00619859112106803, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.00014355209252272556}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..42b9eab18a0776f34862873da8996a91076c9130 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.40779764093890236, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006749470759799865}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.3339982498926477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0051660582576264396}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.32230266861303564, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0048281759328283125}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.19817732601937044, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004893873974067158}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.1573726221734629, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0039063568571845834}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.15178353399364186, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003601245697432249}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.3339851607013017, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0057395182257868795}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.2772342692279814, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004502719763779498}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.26354330056583525, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004061719501266238}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.355640558427129, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.006060447943606269}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.2909276700332092, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004597339854441208}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.27948934790150487, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004243290285056338}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 6.113913674438396, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2845444346534936}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..331daed83b45a23fe50c166f96f0f37e099c9088 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.5939976969577568, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00597648294614709}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.4939964608830419, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004854390017630889}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.4849380642186884, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004355815746994613}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3429952661183253, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00515916785443233}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.28183473457747743, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004314448844451297}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2746788042383533, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003972200087077648}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.48926352683264596, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005477667118032463}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.41141002664463655, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004507582122597571}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.39871995851472236, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003961110866080806}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5218315934141331, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005640661046456062}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.43384665246421655, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004499937439083978}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.42346399930907747, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003988595875260355}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 10.760879957677538, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.5042906708162518}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..24f3adb0521896a3e3e61c86fb3108342cc7bac6 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6019219169272445, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005604012930862085}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5151330504249194, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004815894474393196}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5053991892967815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004167102690049179}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.35018891335285957, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0049939012831142415}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.2979084423676019, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004361003950778779}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.290027555024941, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003991847005443158}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.49562886803557976, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005164938327908107}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.42812641981008775, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0045613651384270254}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.41633220740186677, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003931067918794591}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.52905000223195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00528992708924929}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.45306067770178426, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004541275855320315}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4428486235419869, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00391944756063056}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 12.369211984052843, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.448852909648781}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..aba08c62b825a66d0c905a017a3ec5800b86aef0 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6140783854575388, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005431009115559044}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5234983456818771, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004768197563181045}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5188091003949884, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004001638126026132}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3616064816232795, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004809450409130446}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.30875562772999404, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004425066895675964}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.3021394744685031, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0038954845230459974}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5044009302215042, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005013807703028974}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.43375082514527546, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004499271894605309}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.42595166532510825, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003777679763490047}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5413307307536466, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00514634411644012}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.461768303468851, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004504232243207126}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.45558938836923335, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003763313157684693}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 14.17484508960742, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.5664456945961425}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b7dd010444602632a8524f7cfaaa585888dbdd96 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6191143144670236, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005337442887903016}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5264732013696803, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004770759694377333}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5244606185874572, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004015174635271785}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3665837047538975, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004873489688947118}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.31075640273894506, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004332106658701811}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.30622163667203917, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0038706333867826303}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5082660653417226, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004949576580934907}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4364109573951558, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044637214108884105}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.4306090940048975, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003740221372195741}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.544246848734745, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005082580961481355}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4627900854261117, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004456263162285971}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.45924248488226455, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003747137495910571}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 15.295746341312748, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.26791338636163015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0eef42fefe741254c900cc62bcb82eed22c56a95 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.5105145423402562, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.026171756855261356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.05037228727550738, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001521468424950354}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.3071218918316016, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0043691999674182}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.07765807357653474, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00157051415399817}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.012731823828237326, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005953055496072427}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.08372739502164273, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003169064201453838}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.020127229000738652, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008029287449570219}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.045538158272609604, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013832440826860991}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.2891643639944347, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00405859491152379}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.07063525535553608, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013364210821022566}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.04196005053080567, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014483655274799454}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.24958331003009082, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004179855124707589}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.06378666642891567, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001443233877006769}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..babe5a12f01aa35efc3a0c339f6bfbf7d7f60aa1 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 8.939610966084388, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.5491296743636112}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.5062915093921636, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00623855847389292}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.41058204082882704, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004939933475915485}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.4021073167056978, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004419809770211547}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.2697944089487611, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004958802537142803}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.2134665719582161, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00396690561907614}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.2086623721496119, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0036579604366172988}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.4167417293385908, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0055300014427683885}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.339652862796795, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0043904468358186306}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.3297792565875231, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003854890830252956}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.4428822362875854, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005748715854833886}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.3575821685337907, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004450913161326163}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.3492594208071959, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003952122386520726}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5b6d39b1b4330d2bc4e81ad8d49f4002bb47a3b4 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 13.301890998672494, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.28505139412189195}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6208153521814332, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005474019869143439}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.47992261425880245, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004722131355129768}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.49670042513083745, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004080443427810968}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.36134920028292394, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005006394373239957}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.2724687776194162, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004121139027181532}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.2820177000325173, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003846612306357234}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.513387382273534, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005057388660088079}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.39803668729645497, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004363439679150111}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.40938950141860875, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003780074803334432}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.545270404965477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005188121414246482}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.42076726693509325, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00439492850178596}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.4338077009112178, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037912006207288774}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7f0a5197a8c0d3516f09bc915f8c3d7cb871d562 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 14.016710673793215, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.239430646843486}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6350567970797427, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005217582777531534}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.48991548026332876, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0048380103610504}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.510262226432751, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003962805128007689}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.3768399298727737, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004940475252559112}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.2846474758377686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00421808227846276}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.29589604538503367, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0038655166790178765}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.528667040055799, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004991662892266718}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4078265767945971, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004463504554334357}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.4229438793809449, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003751130990106598}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5612664782836534, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00506727365699781}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.43042919293423204, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004457143541786356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.4476311036451381, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036949691242316235}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..afe07178e03ac1d4b6eb5aa8d3015392279db73e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 14.147226334370654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.21991183144780196}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6438899385054072, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00518110126025396}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4890513089689242, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004794116210959723}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.5146917306496103, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003960214700402593}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.3827536576685021, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004895445210280526}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.28694164018374313, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004276215543973811}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.30042048261651383, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0038878273195587917}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5359166963471615, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004878807275767118}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.40806729713008705, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004444151439522105}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.4272118078716951, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037310099515515628}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5685304722579841, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004971555014070406}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.42853658116271176, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004446374648146792}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.45115401432632946, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037186979256893797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d46c885cd311c3f6ecbcb8e459722a169d2375e9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 14.415231653189736, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.3973135244971607}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6491599899632405, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004964212601778852}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.494038148373054, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004816716712730207}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.5219352094405624, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038752976995851656}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.38903348514620983, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0048629347647574355}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.29268387759461634, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0042689170185173505}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.30681215749599383, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003817111253434826}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5409007074057642, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004834940468527818}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.41236919893457097, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004434351880386135}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.4333675825221648, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003667286185960674}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5719383870371011, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004840938070004957}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.43375185696655677, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004452540065472888}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.4570999768435371, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003618688980073504}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c3889228472370f32ab602cab6d4bf080c36d1e7 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.04292647255629735, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002183817456220973}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.21544171884055693, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005550023566245474}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.05701313684996747, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015578272366606155}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.012442143378991483, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014472589586154872}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.07244352525117184, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003313935921053701}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.0161244583453129, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008645866446531068}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.04051731369178037, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00211528013278168}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.2069649299284769, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.005347874387449208}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.053695049084465485, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014291447250690515}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.038419638740722094, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002117434826745668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.1872359151843902, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004719207977946223}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.04971651734495408, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013529088641741497}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 0.2878774611129946, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02191645372253148}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..09cc399bf8a2ea84d0dc73133448b0b4f790cd57 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.4127938000153225, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005947870279386254}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.3825281448261863, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0052859610720223885}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.349736522766564, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004520392025115228}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.1913215925740367, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004557837262848308}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.17961092041143203, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004131524292644386}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.1603801895545452, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0036128830354525514}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.3375779972211795, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005167676099647845}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.3167769212968609, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004653884254645602}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.28533791057700214, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003862887782280124}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.35802129740566746, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005342789406951055}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.33193124379512834, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004717518923135464}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.3016787877742329, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003965250670159132}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 5.985300509654477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.27766678463599664}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..92f7dbba83a135167c253390e566dc98883a02f2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.573178421066209, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005581271283388568}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.506749252047594, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004824630423734974}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.48887346865850884, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004124343604769924}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.31664607130401096, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004795448811178974}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2793566561351489, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004295985376268616}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.26639799997553276, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003828474192856928}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.46571012037778253, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005055354594170153}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4153448849912012, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004462294046186775}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.39656298401250717, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037775849152242224}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.4980514939537973, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005216216980737215}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.4394122607391806, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004474633124329668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.4219536083199477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037956133967700628}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 10.836147959434891, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.4439636489991431}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d7bde98ad2c2995c9cac0eafc825b06b3b742ac1 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6216298638120148, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005214926213611381}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.5151245895293591, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004798742376668426}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.5199191464585726, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003921506273107964}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.3561340758337231, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004856631792498089}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2923563322177373, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004275271282396517}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.293158556124659, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0038860344856338604}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5065253572253645, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004906755746264397}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4226271623332692, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004467796136304541}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.4231221543799515, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037057307812666143}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5414810935939267, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004944762049361074}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.44852332402277933, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004450097989753056}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.45111782021645674, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00365315425959574}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 13.831489814005128, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17913601909321958}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4744c199caf4bf24cec5c505619c73b0402c70f6 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.63710786327223, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004986809833607892}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.513518509009756, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004813792451748782}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.5282464366263173, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038579340284982727}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.3647903015535913, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004731428227236603}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.29336775947257837, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004290478506270675}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.298218326766099, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003802989738412845}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5152749151762644, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004682500347428617}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4178051528960531, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004473683244984259}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.4262980387789171, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003633393223889575}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5529446488559734, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004785169149105944}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.44435272882390914, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004455322990549037}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.45588243723533217, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0035980738333845344}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 13.968560539400379, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2438004666871702}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d100d435d9026e2a3d0161dd6989988172e0ff55 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6467284283438648, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004898296462472164}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.5134564578559546, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004819956408111058}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.5315882216040946, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037313655895080174}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.37350249398383, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004794722873620433}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.295869273851471, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00432835566336737}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.3023749154277527, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003782427307375681}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5269059518418968, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004687269593553424}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4216944286006577, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004521154789292203}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.43309432687633015, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0036123106429110877}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5632821205544838, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004755701765839582}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.4464070974868443, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004475800727796388}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.4608334361195571, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0035311597997223576}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 14.284859993058577, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.3412668530317404}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..731be1375518d97d7613ae2b4ee688f5f5b01257 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.09494981088402522, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019916895592894824}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.2228998019687395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005899199302218869}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.08842339112625007, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). 
\n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017984454249847302}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.009491868531725244, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005473549381635728}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.05937321877500494, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003062549575730396}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.01550921123717699, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000863216419957016}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.08719336306054104, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018850492893613022}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.18578635977756294, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004400753558487686}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.07629382730868248, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012776155056859815}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.08924979026386516, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00198243157294504}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.19477434399246124, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.005308381914737892}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.07940975128762484, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016387793706256538}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 0.14362877072282437, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01599760640382167}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..70f3df4cdf9d3a893315f47c88a707002ba9e783 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.5112682811495951, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005540977634393473}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.4579965048546065, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005165963633977804}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.43390542470548804, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0042841473342437875}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.25940829162472157, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004518501612861277}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.2356293403240199, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004304856124437503}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.21795173194027365, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00365577553521092}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.41368010519825554, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004946748631043231}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.37301320326760096, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00456992163294301}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.34935816010638454, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037022423523572394}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.44024779450138224, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005089317446235496}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.39323718964491866, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004624020652712365}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.37067691947406095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003757092658851965}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 7.860939478648252, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.397136024152674}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f1a52e7974e52c38d7eee7db932c77ee2aa197d0 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.6154756678079366, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005226356611902434}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.48775772286111885, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004918534040367806}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.4995611810480339, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004047233978311527}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.3398699521217031, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004795740723998624}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.2695930310727578, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00425732564977117}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.27278137993599993, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003848244890407377}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.4999239471058093, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004837761355126683}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.39933195066497373, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004494917857537742}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.40537627802968923, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003711457442747327}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5322953675512873, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004983631285334954}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.41919592625717844, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004479048342642623}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.4283890702376615, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003682929524417652}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 12.185026341639746, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.26708696501401935}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ebce381532903f60721e03177e30f17ca8ca9e39 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.635650445531571, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005052022120074668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.48710114427680495, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004948848380530543}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.5099797708749086, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004002650362686148}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.3589984262150251, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0048015517732940835}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.27518191624261584, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004323754355044185}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.28524387854268946, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003925040473289598}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.5130845093457147, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004695868884598406}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.39623149975458216, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004526826786948583}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.41158287666032306, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037209996181527677}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5476870363212731, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0048318445677514536}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4179872547044159, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0045119446034739595}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.4365260865248515, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036764639046590435}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 12.62634123629319, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2884546742216957}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..544ffdcd3ebe64861802d7acc206d977e701ec77 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.6548086825341891, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004844894071962483}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.4885276740034546, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004992206467831831}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.5189004515187761, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003932134113995795}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.37233831020226843, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004732811866114619}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.2802554209845624, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004455143887198717}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.29267546193354504, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003899222651754383}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.5299274777087314, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004612940099912932}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.39803918367090574, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004579736956902424}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.4196684086168609, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037106616792705995}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5650146079911865, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004705605198203453}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.41970914366117246, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004564652484857213}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.44492007481859275, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036525630101091125}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 12.414902042012542, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15981954068986562}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4b9c8de202a8b3c9765f133f3cc529b602116a87 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.6577563774012802, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004766886700582482}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.49129937789940686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004940830750132916}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.5217786379718019, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038125394534580656}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.38214760041810264, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004673723142913314}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.28616105569647965, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004324909290729554}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.2989602811652376, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003725211805563152}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.5362327775987002, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004544774276661362}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4035564959053218, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004526430360279699}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.42517200603030053, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0035797978808816975}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5690808761307263, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00466882829818349}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4240291676864204, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00451863928482515}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.44858456556468956, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0035259657795515694}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 13.109705721827863, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.32959907803487387}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a53c99cf7806145c61b135fd9b5669dedffe7efe --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.15523872132511843, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002548593592832838}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.2578540135540898, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003573621223967281}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.17691111022689643, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024409979706619444}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03670831515447064, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009288154609525573}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06435957416412438, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017209958971101799}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.04278206373018957, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001017383953281967}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.10782482061267594, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018535006806478674}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.18517259988551665, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027120363581219203}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.12316699898062232, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016560558398715924}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.14436386999444625, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023958601327950585}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.24020355278235528, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0033575116915700192}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.16436893494623508, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002269262360679833}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.6688327716635425, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08895603768971656}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8bf6253cf7321bb6fbf04b6f782261e168966bfe --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.19251466904858092, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025018512878404245}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.26482791100683756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002991401125215465}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.19817292293453556, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020508834598208367}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.04273398024706325, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011142605916336127}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06063785632750607, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015272956454286252}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.044030667142745485, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge2_fmeasure_stderr": 0.0009919231933280177}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.1394015875344974, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001826742692456596}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.1940499587002955, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022281816682500535}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.14314934581168257, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014037837246694274}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.1781067800042176, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002312722439135384}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2461164131763784, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002792558935536892}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.1835790913887019, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001892477940762234}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.2866775641263093, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07082070203339437}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7d916cf5fb4c69de4cdc06a682a31b01b42d08fe --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.22622981875069514, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029442678786812575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.27051388062395576, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002891869201932646}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.21440807047777405, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020452435106786196}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.05650119739076315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015075926525400492}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06519053099084247, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015252802022823956}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.05125393540374588, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010901442769842603}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.1655500657679839, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002257578647509859}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.19876519282475, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002181086524739298}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.15582785215592246, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014510195081137937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.20977427939720425, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0027791163801520113}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2505198544806778, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026988444846593263}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.19828406308602203, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018921021547870405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.6575458090890987, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1041271763194214}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_3.json 
b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..981cbf9aa1b798b92072dce2702c45cd1ea589f8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.21044043846517407, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003415454177959976}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.22091950701260638, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032225297099515117}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.18231062776327114, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002382883544618064}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.05593482165530236, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017382726978577543}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.05513971414698624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014788145878236471}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.045684759403535856, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011288709968297803}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.15945923460088876, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027306827278561255}, {"task_name": "GEM/wiki_lingua_en", 
"prompt_name": "article_summary_en", "rougeL_recall": 0.16575564694153058, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024709668753929714}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1358859920094012, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017633408647579247}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.19655719112323583, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003246109842646873}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.20527494742011107, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029956447445675}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.16944346876676417, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002216821657918166}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.6322531721634923, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09664327595514159}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..74a6a6f24a5f2a2c5d5dd208f6228944a3b9e046 --- /dev/null +++ 
b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.07543339925751304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002912744472352012}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.069662997425629, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002574832610229587}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.05957194176610327, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020806329854399994}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.02006705843647472, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00126274734686985}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.01752499067978901, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0009388325078673943}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.014937212652877212, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007702355493743181}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.05854875141211828, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023530742721753948}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.05294028592073027, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019759992033135617}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.04508232351505334, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015754425551504734}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.07005378436649634, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002739806542216889}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.06422067776590708, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023755736065222944}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.05492996905505073, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019232228948358645}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.14536104754565293, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01854835494441183}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..16cfeb4d07205d6f18a7ac88a90b28e9bb6b0f72 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.014365581022065697, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001560512259717543}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.010914106815652429, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001117758970327415}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.009728718774236622, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009456594824355727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.004079235472292088, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006970061514227833}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.0027410666892011626, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003708439715072337}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.0025962149623228593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003597331522493365}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.011459758165628971, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013480152992941183}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.008202875925360011, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008427013537304956}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": 
"article_summary_en", "rougeL_fmeasure": 0.007361100442935105, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007285180962432676}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.013658620509020312, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001505727520339385}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.010192549845396262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0010464774903728251}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.009121295941974078, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008902015179642384}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 1.4572877711432085e-12, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.3364596700068064e-11}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0460c7c030ce7607119ae3c29b6bf3b4ba49319d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.06452403733371909, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014967240435340412}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.08494571623153917, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0019362539528511185}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.06517623892397213, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0013838699418831327}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.007840253016222527, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004579579103774128}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.011290392212911112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006492975987186066}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.008217000605831354, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00043087034048410583}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.054902554802216445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012088332725248313}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.07398900765280814, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016591560862410744}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.055895886010082024, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00112519759543134}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.06060617538968794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001392308983087042}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.07979703481975739, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0018051602334693293}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.0611621603763303, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012850318876890584}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.4889087616070698, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04970736558027736}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5b0ce49cbecf12df107d2851ac1e060c20f7ab72 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.1151865879029384, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016250307877578995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.11071719364535992, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0015355647070335555}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.099574793692519, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012103234622713214}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.005991291271737141, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00045282560241909244}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.005418015321491166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00038092535887569517}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.004883245190477912, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003120388028651935}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.0927927612021365, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001300637168444661}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.0891743403262336, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0012125027041917508}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.07964868005031557, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009142347526244936}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.11128782906033827, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015558741024301066}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.10709447290067853, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001469683871656218}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.09625017841568764, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011571719141370627}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.38700981358791664, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.028096574852327708}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..57342a56346b2bc868df2bd7c7c3b0ad4dc6d476 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.18923293051242865, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003931215744109829}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.14888789271257125, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027634371759885064}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.13402973283377853, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022221428328858175}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.05250476162006954, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0023465752364279506}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.03373827776460467, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011812065114166165}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.03139270460463245, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010530936216454475}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.15396186822994368, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033958787799507834}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.11797169313356873, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002188824119755982}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.10624803463455436, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017616455857783213}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.17982708294304905, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0037912824701119793}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.14035126579839385, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025962250169538686}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.12646589294514818, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020915649804801885}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.9793617937243397, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08162245357603662}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9365ee21e2fa17b17ac350aec7f4ad860a9e5cbc --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.2111603613464384, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004316562776200497}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.1609555599016601, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00308128531187493}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.1439432413777907, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024332384864776345}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.06549752110515275, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002622796804725408}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.042567827326622294, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014369849379045625}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.038684529531259455, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011859610306448835}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.1729592486561157, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0037461636917488455}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.12871277703565398, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024988677821967836}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.11485216819737429, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019389059734766416}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.19990549390147885, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004161498874034307}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.1505907917034171, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028923842560353457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.13489114533789145, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00228554203025425}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.2238921561637253, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05819769717187897}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6d9cb42130e15616788734e15851726b0fafd4e9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.07178178912956662, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0030761995624091995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.05474694667796429, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002299511195674342}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.04800715721597718, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018517108845868325}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.023714942805799485, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001754604095092442}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.014969677658351763, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0009231868199116232}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.013340223113737474, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007807829472162995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.06022721464718216, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0026905863744203424}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.04482442172314591, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018971481069257855}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.039101876969439266, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015041565892311172}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.06748108589924676, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002932963122342718}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.050561649482659246, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002129232058478221}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.04451495139778381, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001720795547452951}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.09176000483883211, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.018538430182798854}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ca45d5508ce29830bb6276b2b6a40d4cdab4f966 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.011616993456994514, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013805524814779022}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.00835448439931725, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0009502784109930084}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.00754486001636464, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000823337338392106}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.004000350645166246, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007784970288200918}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.0022515189052559757, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00036203795760830825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.002198804033049241, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003741043115897432}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.010070274369408232, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001239457603444862}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.007063094669971561, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008091196938805151}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.0063189425177954785, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006911151227391025}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.010987649566674518, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013277168653009403}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.007790571198885496, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008903647625496989}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.007053847928732188, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007783090333692333}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.591388902748712e-13, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.7248641435081037e-12}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aba9d6a7350450ac55ea2fa03fc1a6db51947f8e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.05003590805872101, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003008991351388589}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.03897698027943584, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001807082535411678}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.029020887729209358, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012886185406965965}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.004163791361437201, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003541290963917177}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.007124397102554354, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006653760319599128}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.004692985339977893, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", 
"dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00037296907132779064}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.04565184907526892, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029236719566217255}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.033207332283082446, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0015253259864701307}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.02437466011761543, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010326036760338344}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.04827889432422321, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002979117581191133}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.03635200073623344, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0016856668824093268}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.027066861347589673, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011944361879851397}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.18780593483984345, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 
0.018823304784897736}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..06d6c0ae65786c5198885bc999197dcadef9ae02 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.11799327511255576, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001719462692141953}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.11232330243321362, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0015217035094757553}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.10117935860269386, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012163616286975524}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0069239119815885765, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000623654699605598}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.005623588502170896, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003912742485234744}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.005230543547670169, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", 
"prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00035444811513678335}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.09540844175890396, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001400138877931801}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.09082951233546875, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0012149726294158937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.08118840865208957, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00092302844984077}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.11397225810714907, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016549375922209658}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.10865018897124848, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0014652624998135145}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.09771333398780462, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011609788706252934}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.46974098072284515, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03801240502570748}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5d99c122aee8eb6d454b13f6c01a9db6fee6ea9c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.20996717805826454, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034007578731400064}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.1993937087135368, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002731695024599099}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.17027994304378627, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002076725188168782}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.051389185254084584, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001897799812976046}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.044041397040170155, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012928785613979953}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.03781539212782324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in 
English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010547868001683674}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.1650212857762001, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027728900367332667}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.1569078602081921, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021538351893500597}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.13232700263489727, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015446220080689499}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.19770727209848482, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003271890975763662}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.1868234597182294, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025620174566918074}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.15952778726250663, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001949864660470336}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.464357632363685, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10634222940980814}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..09a72194a57be955726e7ed0c1ef556fcaacb614 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.20782306336974588, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003989603045660709}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.16825057480875721, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029189489594619615}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.1507903796696916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023330196000993448}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0595807183516492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0023524175410708278}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.041168151935746036, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013717156421301187}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.03747403723815723, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in 
English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011258395271525113}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.1665910745941347, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003339398193955642}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.13313952170536733, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023073953391433122}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.11863694423078541, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017954646430025558}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.19573487618267474, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0038169172216808707}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.15732403928093558, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027338622985711727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.1410782911388888, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002189993147895174}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.1276225158037962, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09278611348359975}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4008410593322f19841ea5ade2f0cef5318af0a8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.07231401152182886, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031436302597989876}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.04905280746691137, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0021156075931983865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.046502363487238116, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018245132184245493}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.021238504757177847, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015693705294511635}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.012651076337640541, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008729050713848778}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.0121019455142586, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above 
in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007543749415825447}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.05972427393508793, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027048881388111313}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.03901746728275879, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016833219673054504}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.03704860848718911, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014486764721672722}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.06781797173242858, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002977618733021878}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.045328572912779454, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0019558428897344673}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.04318243952400433, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016973104965278527}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.035430233625646367, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.008458077459062699}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..371a4229a9d29f69e5583f8dad58b9e1b615b557 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.009674555582528548, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012169889334882255}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.007085196538062005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008674068166182141}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.006660488491670164, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007542427498051077}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0031805595203384497, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007013139041818058}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.001691874973499856, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0002981439928898985}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.0017019468190412445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text 
above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00029911355932309857}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.007534759253956328, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009640139065180261}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.005673494013808289, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0007048591996388539}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.005265213067519611, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006070437259395422}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.009159996330829265, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011664337061398202}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.006641483198385802, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008151851711595139}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.006249297337617345, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000708537597444003}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 3.413680523283268e-16, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.916096847388761e-14}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..36e9434baba1f6bcfc2a1c8da3cf034911fc66eb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.14048666426016918, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027632901821198767}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.20669383392760043, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003301837819297362}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.14941919223655403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023626814400315678}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.029525585854477905, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008710537995147101}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04703290588131188, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014209190224071058}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03335939346289185, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009250316805971055}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.10735133959204357, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": 
"en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002260925698512594}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.15988100122471754, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025788969320249258}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.11304744711913277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017089859184516304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1306418700473533, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002629490470671884}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.19167920541216937, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030766125540393364}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.1383102708440391, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021868611093705326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.2473591602494687, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09068336197346967}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e6c66c02382501bde2d95cb7908ee81242e873a3 --- /dev/null +++ 
b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.20164190917278885, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00340245779140814}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.18321322169620108, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002763046996934259}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1614637512336107, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002154838127118863}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.047278865796894595, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019439681128851241}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.03836443905737192, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001290609856100645}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03413869729774695, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010840564141073451}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.15753098406464713, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002833744217070429}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.14069990485042436, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021198152025286845}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", 
"rougeL_fmeasure": 0.12354041881834915, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001615855049003565}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1895150765616292, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032439901277073056}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.17138871604556788, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025668579918568207}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.1510952282258089, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020029762719872084}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.161551083223167, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08752015895630673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..87b8621b630c1ec86c3e44b26a7b691937f9aa04 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.30853356659765047, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003908986083294316}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.23611642752367193, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028098851869033524}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.22448439980932258, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022164668755803446}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.09422907936670104, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025861688595862934}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0641343554561749, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001505883040485307}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06229028061393615, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013505011306650329}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.24269847651739596, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033468376687347985}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.18285960289057043, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022296534964715052}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.17363950691564434, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017583318279909019}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2890613207429283, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.0037637245904156287}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.22021341816156315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026435827841586675}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.2094330802436139, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021028796887348792}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.6712694957538092, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12335434340064529}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5695f2411d0d28457372901b82792f90f0969906 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.28082836293366026, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004319205532930181}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.1968690045661053, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030792534914180656}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.19386388642181623, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025825739138006965}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08943278692521785, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00260423039924173}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.057174167865699795, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001596148314592036}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05709915142524781, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014322906346376607}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.2250992736689434, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0036669609547972885}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.15462182132992405, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002462032347060667}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.15256193731747766, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002064934954063214}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2658819512511678, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004162301116768716}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.18550776729783153, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029178715298313596}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18280631870790667, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", 
"prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002455551930143459}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.8478598408877533, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07777205711444814}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..71f825ea919627df014d57b7368d3fe0aa45d66a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.09422577510372415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035020894909113}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.06252755874219122, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023819303265762287}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06231197551494105, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002187598780650555}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.02901231903773154, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017246417352838742}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.01773347814345909, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001029862057905482}, 
{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01797607134766925, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009610791037997163}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07680762398075403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029646336358863275}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.0495501454165194, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018970706209682415}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0496325144290003, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017538529763009563}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.08907486896592501, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033434435220432352}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.05871435609309791, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002238019081316999}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05850840107044648, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020558492019394463}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.06105217041635524, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.010329575504157805}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c0cfac4cf805e74f3a9642f17d235cf6aabcd6c0 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.01542645344389709, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015844345606781326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.009780146620517932, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0010592344698244329}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.010082013794884772, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010101938209082168}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.00537468824664498, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008303851566942339}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0029150772505705507, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00043016549798545706}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0031781048041287503, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00045646146249046847}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.012798829405832771, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", 
"dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013675936853478024}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.007665110474443101, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008164388748390192}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.00807413792108403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008184301651151722}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.014620295797040817, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015055626328950474}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.009149077393940025, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009886149667351712}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.009445400067942783, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009397610137449974}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 6.646244004651184e-16, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.685084737559834e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..b55bd9f96417ffa284528794c9d38a74eb53f698 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.1168570051133667, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019198426627741788}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.1481389494974355, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002193623326652998}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.11485230802775526, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016046705097408755}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.013144492989594468, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005689151589046461}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.018392171320423697, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008301672620875432}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.013633187300002049, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005587494363579251}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09620473485098723, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015771479521240449}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.12284031238534873, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0017612038827154002}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.0938871115571235, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011996389682465774}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.11042358047739399, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018173803801964318}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.14011085746312696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002067141292175654}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.10823902153271642, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014865854550978846}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.8119157520078589, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06804323722700163}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5ca62a7a7520bcd3af9da263b202a06dc98f5c7 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.11550809880471202, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015189235287772081}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.11401586371732765, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001531122722495617}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.10155038790836553, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0011942054554629799}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0054870610531079925, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00032521139602944963}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.0056622848349370675, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003699905567380415}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.004949458338597906, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002943527715399227}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09245029143745188, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011784020884857262}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.091481469494611, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0011972745307006892}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.08080820739114689, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000878736781073791}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.11136411939666795, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001448526312362296}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.1102168441246706, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0014748044014163042}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.09799762905139109, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011402026800591278}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.38221868011066523, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03853483789988312}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cbda708e8bf67116bd9b83bb550e33546bd011b6 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.1457819637516262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002011906150918097}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.20956318422385695, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027712949082613473}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.15340882653832058, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018318475648937066}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.02271156846877315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007950544243146088}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.035721592276879016, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012958816708578905}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.02440471180848951, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007647500431265382}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.1077615509646924, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001428107586317939}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.15720431590652129, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020890517297875114}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.11306799842590186, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012328157824464986}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.137408970351226, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001890837273370008}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.1972394210829142, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025925068905230946}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.14426644390129503, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016951248163021794}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.3895694754568146, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06986390519153292}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1fd22223690c15e22a378cd9efa531ab16fcd04c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.118327473789933, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023211153395344495}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.1694788781476762, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002956629357723171}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.12171175588166115, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020194351189424006}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.02010129949676097, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009300686699073729}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.029441824320720608, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012058268535209634}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.020068262069926297, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007221000793102024}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.08953936172947453, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017775315158264063}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.13029929545942714, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022970146784003155}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.09159725836407859, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014444402919229701}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.11070722310951053, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002166951419812563}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.1588130948616694, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027649492276086316}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.11379947696823768, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018742255655409391}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.3731463518427973, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08696587018606129}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..81b09c2911bc24306114ded9d07cc3e8ca01bab1 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.03293908776302082, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016730427345075334}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.04519023931424098, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0021035039013836245}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.03210009752200412, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014634312870140265}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.006510228955843905, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006254896794491436}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.009066760076675555, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007714841572782599}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.006176815477796111, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005244170311147141}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.02522752422399141, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012818467423676987}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.03584434932056522, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016868429030500354}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.02466933947334243, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011019585144221695}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.03087353364690276, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001563351432924281}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.042420874912671644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0019707727296120795}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.03008730837903098, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013711177420732026}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.06336544092366646, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.006198900880420144}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a74c336716acc43cbf90e627fd5fa5c2e0deed23 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.0028814261031211595, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0005460823382341378}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.0042379170709383514, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006322445256233903}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.002942517384632592, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0004530137775198365}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0007022578331392375, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003456033604074101}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.0006207799035616884, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00014464546198768093}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0004737386445928847, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00011428924831347669}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.0020692242308215287, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00037071046415170156}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.0032284384153396534, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004792314433997919}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.0021726977472457452, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000323151520869407}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.002690957058200923, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000516432539920016}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.003973277793754405, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005917271674956313}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.002751073118371148, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000420923652085459}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.2107162453936286e-16, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.3979566517499855e-15}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..51491f4b4b2fc6659a3cdabfd937a74d175b24cb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732954}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.322, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996659}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9de2a0b31564b393957dc9bb135e4205dfd3862a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.342, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01500870618212173}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.34, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014987482264363935}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8f3c727d3d9b918b4aaac51436ac451d25d180cb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.369, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015266698139154619}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.355, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015139491543780532}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bc0b9da01a18663f2cbc25100a51163422b9f994 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.37, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01527525231651936}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.374, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01530876736900636}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8bf71e7f1e85f0e289dea6479f4008d76471a59f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.354, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015129868238451773}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.354, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015129868238451773}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..97259875242a0c5b48c8e2904d43e81f76f0a16c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.371, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015283736211823188}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.351, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015100563798316403}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..280342a39f3040fdfb49dafd29a81a5885595771 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732953}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.319, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014746404865473479}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6dcd7eaf07361d4606b46f30eb5d5b4d499832b4 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.331, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203934}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.332, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811482}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bc04911523320e7e1e3c59544ed3527a31850139 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.353, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0151201726054837}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.353, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015120172605483696}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad17dae01cf37350a2ad973c031872088f6a2f2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.366, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01524061272640575}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.357, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015158521721486767}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6a6abdb66b6b65740902d94cc5ff5056e55785ec --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.339, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01497675877162034}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.346, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015050266127564438}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e963d6fee56c94aa6b9ff22b0c55c09e6f222c0a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.339, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01497675877162034}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.346, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015050266127564443}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9fd44177de71590e9b416ec46026f24633e7e3ad --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932573}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..438d9dc7209ab3e998e79184c1abf039ef2ab4e9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bb9cf7504ceb54a0213e17b623baf1047246669b --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.349, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0150806639915631}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.318, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014734079309311901}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..649d1f7532c9f72adf699b39191b95b5b834b6a1 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.362, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015204840912919498}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.341, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402714}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1e767596c94ba725fd82fe8e5505f19f4a259b3b --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.347, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015060472031706617}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795028}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e7d1b40c6282c6ade85bd76da7dc5bb40ee6ebb0 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792511}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ebe7b3eb6a7a8b1c32286d4e13e79204a90bc9e2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811476}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811473}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..76927cdbf23853a9c16c42ad9caa4038347ea613 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.349, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015080663991563102}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.348, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015070604603768408}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..29583de21805c79fa115a8d355199cddcc5f4c19 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.334, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732963}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795027}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..716ea808415243005191b161ef1bf7f563497b99 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795021}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.334, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732968}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ca8d8e0398f7359105fa907d1cb97713de54ee38 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.331, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203941}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.339, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014976758771620347}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c770c7907967560e991495c3a23c5b2cbcd933bb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811475}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.329, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928357}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..38ecd6c7715fe37461b9f54870386fcdd434a1db --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.348, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015070604603768408}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8eb6e6992be323c57c094641f9c817dacfa9bb46 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eba8e76536602b244485760df8489e4a6524d634 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.358, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015167928865407559}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.342, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01500870618212173}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d65ed00215f4b2fdfa48a3d822e377d3e03994 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.358, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015167928865407559}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.35, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015090650341444233}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3f43ccbf49e9063ca1b4f09f67b7342a9531937f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01492201952373296}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795028}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6204d501b0fa8b1807cec1e83182e29cfb1b0966 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r1_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.318, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014734079309311901}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014806864733738856}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4c02ff29a0eab5c893f4b525bfa6f3e5c13eca96 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795021}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.373, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015300493622922809}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..03d4048927a7239a24d3e4f9688f23289505510e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732954}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.329, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928367}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b843e476922aa7618904fe673e5295de96f3311f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.332, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01489959724281149}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.33, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456732}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c20111d3035c9c59b9370d46f7763c577aad2b17 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.318, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014734079309311901}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..45c0d78e08ad4a0c74f50b31047f252c9b159f18 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095526}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..23d1042d723ee90d45a5cf44f20111798a4d240e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.322, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014782913600996666}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.327, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01484221315341125}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ed8eb4a9ad1c41c8c3874cdae07c6c7832b31ea4 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229871}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.313, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014671272822977883}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4f2c2bd0f2042079b8a8d5d849d605503f1e3769 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.313, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014671272822977885}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1d5fc86ae6e9aea6e9bfdb237d3c5e00bc610b74 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.32, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014758652303574886}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.313, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014671272822977886}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a6129e3b4b9c50d82ac8de7ec6ac2e25a90eeb53 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.299, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014484778521220468}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.304, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014553205687950432}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3812233461401dc0514eabd4e0ae60bead82fcb3 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.305, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0145666463946644}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.307, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014593284892852625}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ff8b65d85c7f95b5560c5a3ba4d9f30a20eb7c2a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.305, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014566646394664401}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.316, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014709193056057111}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1958f3c4e48ec18f940385ab764b5037ce25520d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.309, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014619600977206491}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..01f40115247c78f111c0b4bc4ae3b87760db5b72 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ef0aa93cc75848fa84706e69f702ea4805c33225 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.303, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014539683710535233}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.314, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014683991951087987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..31d8b299bfaacea2241058ca77a4c1a953d3726a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.319, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014746404865473477}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.321, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01477082181793465}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d7114133ea460fb6dd93627231095ebdb1447020 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.301, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014512395033543157}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.31, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014632638658632891}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..92282a246528bffbcb0418008c4b9a9e480a416f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.319, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014746404865473482}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.307, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01459328489285263}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eea7187a37578424cdd3701e821d98a343fbabba --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811483}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.331, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203933}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..81f725f2590df55f11ff1c387a7e8231fc4c07f7 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.297, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0144568322948011}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.302, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01452608023545954}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1f8a9ee24fdce5e3cac69ffddb1f76a19d202124 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932579}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229875}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f7fdfada3ae71dc66178bf1b2e14123a95245358 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01491084616422987}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.329, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928364}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5116b9c5c272422db9a96ec0a80a9c3893dad811 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.328, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01485384248727033}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.328, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01485384248727033}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..933da4e4efd3546d65d1c054d8f85b48409144f3 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.327, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411245}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.329, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928364}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8b141268d87ae06e803f99b6207fe1af08b3f258 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.314, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014683991951087955}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d4861b690b71daf58b9ff355129163aec4dca4f9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1f1f82d00d0c9d81ce7bb4733fbddd0a35fbd4fd --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.311, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014645596385722688}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.308, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014606483127342756}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0a5ee08716b2909bced406afd802266d36ac6fe8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.316, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01470919305605713}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..92cdc1da29c2cc221cbf94ae2ccee40f9a14e78c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.299, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01448477852122046}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.314, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014683991951087976}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f63275668deaa3dd4026725ccd809947c8e79cae --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r2_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.316, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014709193056057127}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.309, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014619600977206488}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ca6d698c5a7ffbec91b885e4d40fccfc9a59481c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.34, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013680495725767785}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.35083333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013782212417178192}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1593652fc9b3e1350138a3206bd7997139d07781 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406396}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0136476029424064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..38021abb50f72903e81b5fa0aa78abc67ccef365 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406387}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01360541734571053}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c5068e33954205e6762475b7e936635d71970b14 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3475, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013751753243291854}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3525, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013797164918918355}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8bbb5b52bcadbf372027000434c10930bf11820f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3433333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01371263383046586}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.35083333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013782212417178199}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..09626e093753823ff52d72b912a7d3b848c8d7da --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3458333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013736245342311014}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3425, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013704669762934732}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6feb59e70e2a571115b0ee25747f0b463f51a3ca --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3375, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013655897185463665}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.31583333333333335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013424568830356446}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..85edd0ed3f178b3e8015a404ac0b3758921ec3ec --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3358333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013639261190932889}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3375, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463653}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e96b84533257db5b843776eb09a6126edc729d27 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3258333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013535422043417459}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32916666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013570806258433628}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7bd21032ff44c984492b31b4c1aa6742dfb402d6 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31833333333333336, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013452948996996296}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3233333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013508372867300217}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1638d2b8f8f0a1ffcefc6e3db1d61fbec3c3eb61 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31583333333333335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01342456883035645}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.31916666666666665, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013462309712005134}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..72344da3ad6bc02bc01401c6f4855670d4575852 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3125, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013386029277441229}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.31333333333333335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013395739415639082}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bc19a97c6e03f3ed44220ba62cb28116e033be83 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.33, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01357953127780092}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01363087184382147}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b3c005ade4ed9b13d9345384ced25b6547bc3659 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ae59879ea2251acd26d7c4ad39c2ddfdfc8bbf7a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.32083333333333336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013480882752851552}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.31583333333333335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013424568830356448}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f286194892595344ab8c62ced52443a584c4596a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3416666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013696658778002515}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eae62f787f52c34963c50c1a760c8ddce7629c81 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.32916666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013570806258433623}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3475, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013751753243291852}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8901d0e3a9b53be7200ae5acbfb5317b1a887fbb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.32916666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013570806258433621}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3258333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013535422043417455}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a2347b0955065083e33d4e64d4203cfd16bfdbc6 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3258333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013535422043417466}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.31916666666666665, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013462309712005124}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..af93dd3c1a8cebbd13fe31acd85eac0142c60460 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3358333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013639261190932886}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3416666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013696658778002517}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d7f7aa24e0e99b049469d4aa5e15a21877ed3390 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3283333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013562032919529019}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881634}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..51736f338641a64dbb9d5d600211f2bdd7213570 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33916666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013672343491681819}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3383333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01366414400661827}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..075aabccf5ac3e3933d3749065db17c8234ce369 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01360541734571053}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2629644c5b7fce799ac7c194272695a6e0c77643 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33166666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013596836729485163}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33166666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01359683672948516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c8854717de0a20384f5d8a7d7dcca87bba7f210c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.31083333333333335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013366457845965442}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01363087184382147}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bc14acb7fe62eba46385145038fd3a81d1e6fc34 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a6a3eded3bfb423f4d2314db8a92a591374a609f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3283333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013562032919529019}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881634}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a8e8c20f439a4b3be28686ada44b51d529bedb1e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3416666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013696658778002514}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01351743812088163}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e649151995cb8d502923f4250cf5347b29e088b2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3225, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013499258621103245}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b8f3db4942dad1240301875bf97b560d30d6c1b4 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_anli_r3_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.31833333333333336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013452948996996296}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3258333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013535422043417457}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..66a0dee9932510314526001c614be9243799e920 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24061433447098976, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01249146853239058}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24061433447098976, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01249146853239058}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..835787d7284dc33efebc58166582de5a16673a23 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": 
"heres_a_problem", "acc": 0.24744027303754265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01261035266329267}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24744027303754265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01261035266329267}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ec2e6e54c77c9cfe6f4478a9f6d10757e06167e2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012414960524301836}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012414960524301836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a6969ab7f9b13b74c7370e039bc7394554f3921e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25170648464163825, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012682496334042967}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25170648464163825, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012682496334042967}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cb177e298c0e45ae70b97b92a378b0c29e066a5d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23720136518771331, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012430399829260861}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23720136518771331, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", 
"prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012430399829260861}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ea8478f38fa69a8302c6200de2d7dd69ee40218b --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.22610921501706485, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012224202097063257}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.22610921501706485, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012224202097063257}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4bf482239b7fa63eeeb0495cfc6cb5b973c777c5 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.27303754266211605, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", 
"acc_stderr": 0.013019332762635736}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.302901023890785, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013428241573185347}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4a85b87ae4fa00180741c6b3614d78044eb2c542 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26791808873720135, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01294203019513641}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2960750853242321, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013340916085246266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2a132d9231a55f7894b35d5845b74819169a716e --- /dev/null +++ 
b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2568259385665529, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012766923794116801}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29266211604095566, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013295916103619411}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e0274822083bcf2cc7cff7b129baf9ab77da693d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.24232081911262798, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01252159329580012}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2815699658703072, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013143376735009009}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..93467bf277395d21cc5d98bff3ac39b66c05cd79 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2525597269624573, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012696728980207708}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.28242320819112626, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013155456884097218}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e2580b4a7c1d63388130d3d8d498eaeade08f341 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2508532423208191, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012668198621315428}, {"task_name": "arc_easy", "prompt_name": 
"i_am_hesitating", "acc_norm": 0.2841296928327645, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013179442447653887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..914fe5916841d82428e707ebee50d0e711eb695e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24744027303754265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01261035266329267}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26535836177474403, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012902554762313967}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dcb628172a90c23a748a4f71cb808783a836b88c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_1.json @@ -0,0 +1 @@ 
+{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.23720136518771331, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012430399829260842}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.24232081911262798, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012521593295800116}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d26130834f738457375d43c3edf73ede84b677c5 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2226962457337884, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012158314774829926}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.24573378839590443, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012581033453730113}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e7a15295f240de2f66d5f1e1940547e9ac1afecd --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.21331058020477817, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011970971742326334}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2235494880546075, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012174896631202605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..63fdf16555011bbdc377292e25c6ad72a957b61c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.22098976109215018, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 
0.012124929206818258}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.22866894197952217, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0122728535825408}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..500b418dfa28615002e063c546d3c958c8668e5b --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.22610921501706485, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012224202097063276}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.23464163822525597, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012383873560768673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9a91b24f3070b40e8433015139607662a2ff38be --- 
/dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24232081911262798, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012521593295800115}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24232081911262798, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012521593295800115}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..89beafdec60219585a21b1b2981c85b43fd7de94 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23037542662116042, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01230492841874761}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23037542662116042, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01230492841874761}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7ebe036f89178102ae31a346bb93067f70bcda82 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24573378839590443, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012581033453730114}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24573378839590443, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012581033453730114}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d63403118a9d98389e33f3ae96b111d083e884f9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24914675767918087, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012639407111926427}, {"task_name": "arc_easy", "prompt_name": 
"pick_the_most_correct_option", "acc_norm": 0.24914675767918087, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012639407111926427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..300a3e15e59c5d2aa898a25c2e339f9715360bbd --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2440273037542662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01255144762785626}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2440273037542662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01255144762785626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c2afa933f29ce0e876b649a7c0e7cec475fa7441 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", 
"prompt_name": "pick_the_most_correct_option", "acc": 0.24488054607508533, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012566273985131353}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24488054607508533, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012566273985131353}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3959e7801a3a8f6e698bbc89475d15b694cca416 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.25853242320819114, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012794553754288672}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30119453924914674, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013406741767847619}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_1.json 
b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d884b4df6441e3f75e27ac80b8d10c32b8def01b --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2636518771331058, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012875929151297063}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29266211604095566, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013295916103619413}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f66667228db117353415507ad2e621a32fd21028 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012653835621466646}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2901023890784983, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013261573677520778}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff 
--git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5f392ddca752dd6ca42810fffd87102a544257a2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.24573378839590443, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012581033453730106}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2883959044368601, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013238394422428157}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..62d6472184e30c81f9c9534458528ad0609c328d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26109215017064846, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012835523909473855}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.28498293515358364, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01319134817983879}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": 
false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2fa73d637119e047f9ebf2aaa5275a1b7e275fa7 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_challenge_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2568259385665529, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012766923794116801}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.28924914675767915, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013250012579393443}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..557e69f5511f7132325161c7990065f4f924ae9b --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008885233166386385}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008885233166386385}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c5aedb0d7a2c3d4507e1fbe528f3e6d5eee76e3d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23695286195286194, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00872518926147229}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23695286195286194, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00872518926147229}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5d6403868cff94f666c3878c834ef85fd36d174d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24705387205387205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008850055161459239}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24705387205387205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": 
"ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008850055161459239}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f4dde2aa133c8cf2ce5f1ca89e1b34e1e0e83bfb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24705387205387205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00885005516145924}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24705387205387205, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00885005516145924}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3a3c462f1b7d2834f458815155b693f33caa98ae --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23779461279461278, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to 
solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008735850753507992}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23779461279461278, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008735850753507992}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c3588548bcdaa003ad118de511f1a54d4453025d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2516835016835017, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008905088235948768}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2516835016835017, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008905088235948768}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..469f262282d4bf84e335c7b6e2ba96696a0ec111 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3531144781144781, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009807078935467608}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.30723905723905726, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009466688832475374}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e2a0f8548bf9ceecab45c764f74166feb341ef02 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.31397306397306396, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009523245335215511}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2967171717171717, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009373559492986842}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..20bd6ef203a85bbf2d87853ad8c84c8ff795a952 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.30976430976430974, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009488172851903717}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2904040404040404, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009314833302936282}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5447e5f6c989cb6247b2488b8861ffeba12b59ee --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.30934343434343436, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009484615220606826}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.28914141414141414, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009302827114597425}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3092f999ba574ea1c7e17ecb9e84394ec20605ba --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.30176767676767674, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009418994158522525}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.28535353535353536, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009266280584997755}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c2778bca406839e2fb862ff00739d9371cc6326a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2984006734006734, "fixed_answer_choice_list": null, 
"dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009388855914040432}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.28745791245791247, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009286682281593418}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ac57902fddab0839801c8e957333e85800991abd --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.281986531986532, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009233124071053648}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26725589225589225, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009080463246017469}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, 
"limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c0385bf1fadeeb52ac3e26aa8f2bdd3590ee82c5 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2756734006734007, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009169229476542563}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.27441077441077444, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009156177122244525}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..936d25468c0c203516129759d246c2927ed2d8d0 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.28914141414141414, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009302827114597428}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.28619528619528617, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- 
{{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009274470774627718}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..172eeede7d73d4e3fb17694f590f193f65f94082 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2824074074074074, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009237303403479344}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2781986531986532, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009195059601583901}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e76be9dbd204f34d467ae372171ee9225ef0e6be --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2748316498316498, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among 
these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009160538115254961}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2769360269360269, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009182190173795889}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d9dc4d87c2a8113ae49b98bbc741c7f3ace939aa --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2697811447811448, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009107527914671064}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2697811447811448, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009107527914671064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_0.json 
b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5f656fa101c839b28b99ad949e4b6fd885fa2ea3 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2521043771043771, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008910024163218197}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2521043771043771, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008910024163218197}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b5ecffdd669c5e49936b0d016a315566a435ad92 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23526936026936027, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00870372426971864}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23526936026936027, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00870372426971864}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..599392965a11431d2bc7985ca6d5f3c7cd667c04 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24831649831649832, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008865199020660961}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24831649831649832, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008865199020660961}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0c708d53c32956524e9f0e66c67bf9989dfc718a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2542087542087542, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00893453768114154}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 
0.2542087542087542, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00893453768114154}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5ff403c9e5db4e569f364149ce27d6d771db63ee --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24831649831649832, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008865199020660961}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24831649831649832, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008865199020660961}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fcf1bbeb47a4597575dad527e9dd891f8f31dc44 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24663299663299662, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008844984581934907}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008844984581934907}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2209c4f152e9f796572d34ee4e1ecb12a7abf716 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3383838383838384, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009709034670525097}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30723905723905726, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009466688832475374}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..3316b584b4ba7e1480585a1bba5b8669f3f20167 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.31565656565656564, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009537019245566084}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29292929292929293, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009338583737393599}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..98615c53d0daeb616fcd435f490f2c4b60691a7a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.31186868686868685, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009505823345817652}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2878787878787879, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009290733161670164}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_3.json 
b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4b34d964fbfa6a55dbe3c39ae91168c4734e8591 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.29545454545454547, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009361987126556453}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2786195286195286, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009199329195026352}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..114b7b685e2f0abbab5c23c92316038b0e09c2f5 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2904040404040404, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009314833302936285}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2824074074074074, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009237303403479329}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..18e9d7175a97df3c5009d9e5d5e0431d35ea49a8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_arc_easy_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2908249158249158, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009318815921176655}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2849326599326599, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009262170695590658}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9f7fc7363e4716abcb7294af97add0ed56fad918 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5386666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009102888762598247}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6063333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008921375326707084}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ 
No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b64c86fa177204f501159474362babb8105dac23 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.538, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009103824830376472}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.588, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008987709736566395}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d291e41f1e15a3870f589673c7e4953c0adaca61 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.541, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099483512819306}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5856666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008995223478188031}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No 
newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6068f7c011ffc640220569e6d5bef0ce6dd2a0c6 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.541, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099483512819305}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5823333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00900559683375783}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..72952ae5c6ab5a3b451b377a4f93785adbe2ed27 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.54, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009100967487199723}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.568, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009045400659508363}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline 
at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..61211d97a6f99483dabbadd153e6908f7f6f2e58 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_GPT-3-Style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5223333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009121118663627248}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5736666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009030591966818142}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..faf5729a82dd48b6a9b1379a5f261aea02a74043 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6236666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008846558976258922}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.59, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008981103499757514}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..49b7dcd8cc4821f0a7eb7e8468d75fe8d8ab5deb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5713333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009036836097555083}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5653333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009051951785603835}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d3e433532712cbf8417cdf447bdc62b7ca0dad8d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.578, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009018450207660424}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5713333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009036836097555083}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..73561ddfd47e6e3525402a16cbca868bf67c7b75 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6033333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008933122315228996}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5963333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00895916952266258}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5cc627cc807eb3b0eebb15c3e8d202e44ff4e9b0 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6066666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008920048383377188}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.6, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008945762994765773}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2cd59927a5f18496ee909d5795f5de7bb6c94f29 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_after_reading_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6103333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008905164372580984}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.6023333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00893695992571691}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f90e9f9f4bd6e64494fcae5d607459e2c1732a54 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6236666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008846558976258922}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.6233333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008848110494114768}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7a4b6a32b1b34841842a27ddebdeb70afc243936 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.609, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008910637827273029}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.601, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008942016171856502}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f36cdf6ae73356affc41aaa2c2aa29f22515fbff --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6073333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00891738144014832}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.6053333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00892533006683219}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cbc1a46de6560469b1e89c9dfba1421f136899ba --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6056666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008924016166504414}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.607, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008918717088507559}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5b9b95457059fc64de6d1261ca9384b3a17d7a1d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6133333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008892593055774285}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.606, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008922697920438162}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..09a9532c5885efcb187be95a0d0257ef40021b08 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_exercise_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.615, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00888545536850563}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.6103333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008905164372580987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..27a6708960778ddadded92c00bc1dc247905ded8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5273333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009116578321738462}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.39966666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008944518370322185}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..82d5a4b0cd96c3c37207a17a1947a184f36420da --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5763333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0090232041691723}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.573, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00903239695383109}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0488da0ad923b455b2f8eac0de8f240950e80017 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5823333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009005596833757831}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.57, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? 
True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00904031207504128}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7616c17eee67ea4fd5fb4ef84b6da861c25f689f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5993333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008948239303079452}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5976666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008954354670397112}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1bc2d6939ec974adc4c19bbb4d8a5c2064dccf2f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6056666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008924016166504413}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.603, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008934405848700122}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e5272f1f81958ce38197cf206f58e5c2b68cea53 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_valid_binary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.6106666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00890378508047089}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.6066666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008920048383377182}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..525fa7f9a544848641b423d5c7c9f1ddfa6b522d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.6236666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008846558976258922}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6236666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008846558976258922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..05a073341116b5767c8592a3d3a5046b6a017ec1 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5406666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099982269204863}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5406666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099982269204863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bcf129ac68ded27f3a9c4b8d965d17e3cf2e4575 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5943333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008966262991425923}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.595, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00896391565823638}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0f623f14e07277abb20e915fdecb5cc393c86c12 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.6113333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008901013367923425}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6116666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00889962094339769}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cda9cb70647fb4c562b0251f01901547ca733438 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.6176666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00887380602276318}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.62, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008863380835773165}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..89f2241c530a3c36ea01ba3cd437230c93a1d210 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_boolq_yes_no_question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.611, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008902401412932073}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.617, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00887674483503322}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0399aae0d3db0d0bfac7d59a05adf8d0da1b0db5 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813057}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.23599320882852293, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b852ac757856f8dd2d89ad9c6486e2fcebe835a2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0646095738380922}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.2563323201621074, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7a0b14b9bd38c325f6495ca3a7a607226efa4340 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.28166858017604285, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b3ed4e0ae76d321938e1007c40e493679bdb4027 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.375, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.20038220038220037, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..63ca85ae64ba86e09e986ae0e0966884b336abde --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.375, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.18421052631578946, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e6e86a4a8034050fe394e61fd567d3c867b68669 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06460957383809221}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.17777777777777778, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ee290467e6a82ebf7aa720771e790ea036d518c7 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.1940928270042194, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dc3bd2e8a7936e03d903993f778f8aac3261066c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f9d5f2155981eab613e2769a48b8cd6322176516 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.5, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.3481187642745522, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6a26ee31a967f01810aa6187b83ad3a4734755e3 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2515873015873016, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a2020732c959ecfd28bc7c043335724200c0f5bd --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2522366522366522, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..21a5f7279711c7d953124dee5e87e513449d6fd6 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0672477765493766}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.23566182215971246, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f356b07aa31c05d738aafdaccc13bea90c33bc95 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.25, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.058387420812114225}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.20014245014245013, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4d996809b76700c509c9e6f19a579c981f7c1335 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d434ab68ecc19364f1a6c18c886b432248e2d4e8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.375, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2642526964560863, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..edd6bdb905e81cba9278ea45f215d8150596422f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0663363415035954}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2858641489640703, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5610ab643c569ade9eb6debb76330fb4923306ce --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.3392857142857143, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06384226561930825}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2222222222222222, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a1a735fd20ba1e35c09a9a30ae07171569368c14 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06585388898066351}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.26157407407407407, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..445695b2bb7f7c31afe82d3ceb1cac9e5640ecc2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.30357142857142855, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06199938655510753}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.25353535353535356, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0a3f94a21790d8730c1ab5824a8ae9ec042964b7 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.33654945683247567, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aa827f36bcef043307dd7137fe4e1275b958fa09 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.30357142857142855, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06199938655510754}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.2959792578695018, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8c8ab76b8df02e4e1ea5cd776b7fa60919a64aca --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.17857142857142858, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05164277182008721}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.1712979526933015, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9f7b70511dc4d93f816bfaebbbeb3f0211a87436 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.19642857142857142, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05357142857142859}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.18839196978731862, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..49dee3c63eb802f6fc83b9b1f83a3844acbca040 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.23214285714285715, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05692939024000109}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.225, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d7cbe4e3b12d2392c37c9bfb3e7cd05f89062f77 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.21428571428571427, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.055328333517248834}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.19458615016659064, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4f3045613e65d0b755d3ba7cb139ccbb762e0943 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e454ad74cc5fef9ab74b7ae97e439f91a86766b0 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2833685198217218, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5aeab2541f7a7caac2eea75ba89946e532c7e34f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2960755091902633, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b6710231b417ebf2ac7f2e58192c70e4bece08da --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.375, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2565284178187404, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..656ecb5a358c5acea18564188e11034ca95578bb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_cb_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06585388898066351}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2676134781397939, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7d789c724fd7e116b9dddc6588269b0b3adb5598 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050161355804659205}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9f595ae8fdca4c54eb0c71d9999c395a880dd086 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.49, "fixed_answer_choice_list": 
null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..75d04f669c59040b4aa522ef1038e5c4d0ec6f30 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3e647ded0413467fa71e41ad14aa7bbee813ef97 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.6, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- 
{{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.59, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ea72fa528916a2753a478e7c2ad6dbc8993d01ff --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6e54dc9a238276922587b6290ce29542e1862ee6 --- /dev/null +++ 
b/4b284b17boscar/eval/agg.4b284b17boscar_copa_best_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.6, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.59, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237101}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4372223f253f9fce100b1121f03a923a9d7e59aa --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.57, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562428}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, 
"seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..73bb291f80c8721c5bd48dc4999ccacef4e2a583 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..939d26f07835f75feb9d56fddee766c65dd81aae --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050251890762960605}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6d9c9de9e32f35b3f5827ad41e2c378f7b0495b2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562428}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b54d97dba8e7d4deb3e00581135086e5330b323d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" 
%} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049431107042371025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5aace4b5a2dd6e63d1be7fa045876a1bae651413 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_cause_effect_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049431107042371025}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5b7b313967e621a253f8533f1d4ccc8489fe0827 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bf9e1af679c0f5775a5151239fa8b10b6c3a97c6 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562428}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6361605efbbf8a2d9a785d0abcfbc0083b47ad9a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.38, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.048783173121456316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c07fae5de6f64fad15cd617e7a44d995a46fa82b --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562428}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ca0153b197c74f7ce062540868d69b3885678f39 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7dea31d6bbeb90df06128d6495076d1ac1cbc20a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_choose_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049431107042371025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..66183e390c1bba6c471252fdc371614fe80a6fec --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..80b542407cea042ee118a76120528d1a7646f6a9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..043f40ee3f9374f874c58e0bce336fc48ffaadb8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d4d4c0e9015b24e227dbbc14fb02eabaec66e935 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5d8e7b2eafe40af590b799c1180988e4cf845568 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ca625418f9d8d5e40c4109c3b9097490f6808b86 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04975698519562428}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..28d4e5de1e9d72136533f1c050db9278bd37f0ee --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.55, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050161355804659205}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d914c87acbb3c823470219605c2ab1c3d362b639 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04943110704237102}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c622e87d89eb4817f854a79722872569348eed8b --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a4632426e5590d2e0fe970d14e28eebb2d86e3c2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049431107042371025}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049431107042371025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3cbbe3192262eece70b5badb813e352d431dc00f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..20b26587cfd0d2614ddffc8a830c67992b72d64f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_copa_plausible_alternatives_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fc985b2cb27b3cb1568a2f9ad1e52bc2d261fa0f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 1.6981740230274491, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05639581747200235}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.18461775373723568, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026005533202261635}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.2709421058746983, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002901032838963527}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.21306073373162734, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00260469936196386}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.05762170363834695, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00144636757600207}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.07665802406321871, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019164632387191742}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.0639746559185916, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015730884231270822}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.15100987885580808, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001847913237566943}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.2275598490119157, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002218584984658347}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.17623125609468054, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018965000242015396}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.15227707203191942, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020577913588062102}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.2249114067227135, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023015700813454336}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.17615894675748597, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020576900261863084}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2ba972bfa31787e2ea2af574779fe742af014f93 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 9.96698641896448, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1990330923860415}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.4609965771337017, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004586076601095137}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.37987535167878217, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003962232318603492}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.3948345170666501, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036620162579725215}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"coherent_text", "rouge2_precision": 0.21903816752228814, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002976279745126517}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.17782940012286022, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024379419479102857}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.184899842771481, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023385739986275713}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.33718394432768894, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0037312597707337756}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.27549038942825305, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003082991317914591}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.2868315090359682, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028875362371707236}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.37936486415476367, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0040948721855691185}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"coherent_text", "rougeLsum_recall": 0.3106003498059364, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003426475280011305}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3235575895067646, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003224417319481092}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8fa1671604a51e509c97e1e0ea38f7ebf7e311c5 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 13.854353887594359, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17297348749849764}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5636886053242238, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003365909868631314}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4600628012976699, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029921086682403184}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4817650325322006, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023949981341780365}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.28301589520613757, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027664917362106947}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2276135952020421, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022770165359506478}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.23835835323959847, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021017748917656984}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4181431608727368, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030295159480321168}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.33948065113070464, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025328414029523382}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3558600551401343, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021673932342781833}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.47016389392378133, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032388215431738685}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.3828571201068127, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027788968245477737}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.4012330801491776, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023666952140306693}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8c9b3b043b7495f347d288752ee8196c196a3468 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 14.583942693396294, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07837319643131219}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5820358474077353, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032331888050979754}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4693874389716453, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the 
information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028872817993720653}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4949134231803726, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002236475337599633}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.29841477725265647, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002756754774322472}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.23792651366449613, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023167538990293366}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.2504276350046371, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002099307370130092}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4315178078770954, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029957554213462813}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.34591473020366464, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002476997441911286}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.36516427629163034, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information 
given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021003983397447686}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.4875277711044115, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003185953377452879}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.39272527260887446, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002758966209609139}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.4141653781843093, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023176252522451174}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1c6c179216f9cbdcc3f496d01c9482e5fa2da470 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 14.709300883445545, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13909289849143106}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5842200644947523, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 
0.0032415680641664523}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.46707140188114915, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028634798993105617}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4953743872583611, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002262323452189699}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.3010808279506765, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027909138698240248}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.23765379325641803, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002313565104143807}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.2520751492512158, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021451907698192933}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4338235689405383, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030183502388625087}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.3450632042572478, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 
0.002512770187116409}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3663278082137279, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002169786461353247}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.49060403536248004, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003216596887763901}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.39198654946334316, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002776695156562603}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.4158359809120709, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002376981852382139}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8d6444a2eb4d417b3ab950f2bd4d960499d3c904 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 14.508027202695663, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19729885956116464}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5827447003386718, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003252743704799596}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.46479474176199786, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002779950681422667}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4939375271803598, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022129580308727064}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.2992451000703914, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027706344768827404}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2350933451860936, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002225186354361509}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.2500737308803256, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020834454268491196}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4330121131753695, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002975630807020434}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.34399062086892157, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002433479099663889}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.36577122165110715, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002101489877024906}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.4903224173893306, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003175989510755157}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.39107749287557136, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026928788465534173}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.41541912166430456, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022884279022857587}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f92198443914a9f63b24b8ae5a6c142d7f68fb04 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 3.05988651777403, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0810296453144049}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.20519732124109513, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00270006442645752}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.3468161863236788, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0035404054532813803}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.24909819431786423, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002893624823392621}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.07031878918831437, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014268262802144133}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.11439603360394333, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022184263434388354}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.08417727846653551, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016451293157964974}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.1598309675588018, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017331961057331041}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.2787944514418254, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024559130158891516}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.19628984368933966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018722658911640626}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.17891503505698214, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002513816210748885}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.29862293188810446, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032327191084023034}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.2161607467623904, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026884533677181723}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..fbe602e1754c56726967400a6a98cdb441d17d61 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 10.006682639472844, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17702306891001293}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.4789153736629139, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00448976955675989}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.3830905201721848, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0038258847063219895}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.40374325026432717, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0035397356280531637}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.2271866148988451, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029128947832580934}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.17899394069939906, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002365623843266929}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.18891534472001092, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the 
following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022875763676466687}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.34845116104667717, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0036371332440899083}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.27628825839114474, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0029710867029376733}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.291888121149291, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027977041936955364}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.391979289276976, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003999654789641658}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.3119866289090268, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003317127496333722}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3294523831273175, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003132967917625185}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 
1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fb578b250eb9dfae779f456acad0fc89ac8347e9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 13.533351780962887, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.21996557214666623}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.5637517388847328, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033693539461132804}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.45574622052482294, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029871424357699977}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.47993566559272743, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002412869717997041}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.28114460801866314, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027675305939149214}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.22386112791717722, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 
0.002265557627345361}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.23583645689850805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021064135619051727}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.4150907072454883, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003041198860592472}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.33333794137558537, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002521898919137688}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.35155736822685685, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021818967091197677}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.46692814202061905, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003234906975148213}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.3767319992756395, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027905499560689166}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3968966833492359, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 
0.002390947561471317}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..44bd429c71c210d268822cdda5225eccb75ab012 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 14.392809420394265, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12682420333047015}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.5761697396711711, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032085414101804934}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.46910207675338633, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002885851272557091}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.4936453397657041, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002268512440380754}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.2908760217107419, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027504762505249206}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.23472065889957064, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002338453487460064}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.2465503719143272, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021497257737069504}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.4238930690015189, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029754914331797584}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.34332965676676525, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024918547413801777}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.3615814515575109, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002132135118187528}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.4791873227267321, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003178341414436254}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.3893636074797145, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027438228796168866}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.4100151570685983, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002345807247704266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b8ff4f83a2cc85b0690fdccdc4fa7f0b0d5341e3 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 14.793650168308517, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10622251877201895}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.5749775266346093, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031835883766827408}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.47259329496156155, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028220142954102326}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.49601283925081735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022190615188824253}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.29266983484455455, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002724658955172727}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.23756368274974432, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00228326458262002}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.24939935966748733, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021121154249644186}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.42468886201366246, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002911495939193518}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.34812073234739377, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00247643056659921}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.3653771997217671, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002109257136196024}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.4796086565780972, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031243508690243214}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.3940103192114069, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002715853863926917}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.41357826323193164, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023087953273462436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3b25b9df2ecc82be2ff5823a2ca17845da534500 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 14.845898823514458, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.21800724630678245}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.5769246962013296, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031674714789666314}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.47417689233706106, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002773164930669781}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.49837978991033016, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following 
data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002207597586987178}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.2946768385756233, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002744759884705772}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.238907636637235, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002260409516558731}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.25132933058908946, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021206561166708855}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.4261285955236887, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028653910956793834}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3494535705791258, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024239805462257812}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.36737081391649246, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002093391008970402}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.48266010526884195, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following 
data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031168143988746008}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.396635855622698, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026943167663674044}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.41688142252277666, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002316312633300744}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ce1e2ad6012800e050e39f9b0a43980e2724d9e1 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 0.43088392670710296, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03961518900803983}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.12074617507907323, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029749213887964613}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.05686771392013282, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011647167758256662}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.07032991515919271, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00136157455736835}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.030789282891283725, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001109990652325929}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.02001834529014065, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006990709895113358}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.023394283471936717, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature 
in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008044627503034823}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.11414146607161421, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028337136738440204}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.05338660372411939, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0010814515894240666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.06604473509926488, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001258776154468613}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.11860380990067865, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002956819438605847}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 
0.055427618982087014, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011338872506724367}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.0686478664349337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013288862391063107}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5a1b809da2385e461d995541e261f1a7594ef624 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 4.2489438945007505, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10848523635627934}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.2978880326400029, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in 
meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004314734059526238}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.20574103567352484, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003452156469381813}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.21799479871592295, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003169109132023807}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.13974765472529488, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002984069627406084}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.0902480509784589, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001917484854887988}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.0954256009916435, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018233170834364776}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.2537181223201809, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0037112591599500954}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.1665688306872857, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002559263851460158}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.17908060542609147, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0023465508632497206}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.25974058488564317, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} 
{% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0039990036630274265}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.1732089631652664, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00290503437818348}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.18500301101744382, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027026113593496136}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0b6f2d65a5b6726c0886d70234c01206d1569f23 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 7.897479428851062, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19090150792663044}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.4097753151220947, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004759830299635232}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.3050743649751446, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0041553698725291955}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.31691617191669696, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038078275455770048}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.20566849161886339, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0034162536009452013}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.1459179094411777, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002514675912366735}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.15116199103567896, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002362966831815902}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.32535955342181233, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003997229419498239}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.2316744800368858, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030801010270189366}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.24261367518705662, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027886284095646483}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.35123874989098963, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature 
in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004372358007688104}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.2548999077417509, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003549861311933047}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.2659220289237627, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003275917207172722}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0719e126652122b564e506a2f1352f8dda1e109c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 9.864961497979543, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": 
"", "bleu_stderr": 0.1317560469306074}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.4501732457506971, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004734510768497693}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.35400553421574077, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004181631727240358}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.367082544858982, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038803792544539776}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.22465174511925462, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0033633484029790346}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.17211207944695756, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = 
feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002640032098747926}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.1775904737014302, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0024963206617874046}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.34368601437901597, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0038108212641332403}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.2642268452145586, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003154585439683898}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.27445570312431083, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028600187937791646}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.3787563012175693, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004249607492303315}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.2946889001020606, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036226116144742046}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.3057169456441104, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0033589708748168147}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4329e514ac594ee3b3d8e2ac13e905b4bfb73dcd --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 11.504942043363393, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = 
feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.26131157019245693}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.4793166897526887, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004447515176857311}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.3926779566427441, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00396929495072039}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.4066062429397127, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036602506735932336}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.23917119853723076, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0031916812958254622}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.19316318038199604, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002618076681843197}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.19940880805541641, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002496228569517145}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.359407103102224, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003589169466666821}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.2902605810383062, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003028323235467514}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.3010015496850165, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": 
true, "comment": "", "rougeL_fmeasure_stderr": 0.00277134061669236}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.40309118710355224, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004055402595432995}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.32783184069905075, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003485623775598431}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.33985627414169595, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003253456875343155}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..672234e1b1c40bdb79536efed88d182c5bc91b1d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 12.37094125306445, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.26637203939148857}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.5019495860499606, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004233425201646585}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.4148487000765162, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003701189400722239}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.43197062064083586, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034335264670076996}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.24964365910833303, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rouge2_precision_stderr": 0.0029949042783772935}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.20417244071827118, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002504613430914506}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.21239127308248862, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0024116062520429865}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.3697094271111249, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003357782592019311}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.30372307503085555, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028621696126892044}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.31619868839227344, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set 
key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0026432709715011175}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.4179517060062675, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0038415175450192012}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.344453227089507, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0033031188522087873}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.3587152017326737, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003099503498187738}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..832b08b162e0f500f2e8478faf892ce5ada838d3 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": 
[{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 0.23009457158327395, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.032672420546843176}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.046037847651470024, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002049984498068587}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.034901976047314216, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0015828043496248269}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.03565768533846172, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015248066303431285}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.016516540815237794, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011075729579088504}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.012463074267663072, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007148960008422395}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.012939399202474111, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007270998548969809}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.040481693666315224, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018321004056548362}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.03073851262251656, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001408552293064241}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.03123228667584005, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013412818998006393}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.0451064653533249, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020199102881650523}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.033817754277814645, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0015239128770440768}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.03473298460911775, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014863724750131808}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4d570af83e1751d95776454e1eb21f3bbb1750bd --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 12.089976671939015, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.168251102142501}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5731074158985243, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003284340506365394}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.44018576325041153, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030131809832668697}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4712653059021899, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023607480373166326}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.275133195308103, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002740669736631083}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2075323986474321, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002196437540323409}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2227090352718732, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020690282260279855}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4165254470631004, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029803269219252386}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3172191245436191, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002462143659310004}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.34042049528282725, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021081402016807802}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4671938157159783, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003215098649092779}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3576660404019271, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002771243835460788}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.38335331690793345, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002363317168332221}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f5cf448c352e37e1970bd19184ce1e3b75047fe6 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.306767136731397, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.20748195293257823}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.586146194582904, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003201208783304345}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.46837981221534686, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028821983932268993}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4959039342859941, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022179464050245366}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.29546214538365784, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002760222751352884}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.23282858226774875, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022643379007857164}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.24660179707103722, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020875846431590893}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.43453348333901143, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030164445891002808}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.34493209177229056, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024926657554512834}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3658398555812216, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021228022591806505}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.48901167662589756, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003188380289601738}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.39060971772253245, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00277158241175323}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.41353486620349267, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023298219846166635}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c26817125f4f76003d7931827c8fd6b73bb34009 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.986652923553951, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14924649133846585}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5852147290185015, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003198801112514953}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.47309130989416415, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002863282453776492}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.49941178939651276, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002215810900097095}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2992462347854545, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00279234169219994}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.23892912708756728, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002326244957735514}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2520595099329764, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002137559032929213}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4362259991876925, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002981357963715629}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3508817024714458, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002481255460162326}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.37080740120712563, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021085500680046655}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.49286818963774864, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032037762403395406}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.398038444692037, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027747754778932373}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.42024778988515976, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023378975699189846}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2be07a6feddb18ddbdb217aad3ae5ac837ccaabf --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 15.355039272215512, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1397449379312914}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5790492697063228, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003163418243454001}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.47626870304120433, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028295282078555388}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.5006142198930905, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002241219932072369}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2964235601953411, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027302282806629004}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.24084946363934556, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002279965123787374}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.25325717049884455, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00212780022639949}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.43244548404877137, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029330079057210594}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.35437374939057237, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002483948940307853}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.37278812205664297, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002143459855710583}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4902546597328058, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003150242604642586}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.4030653858271246, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002753130148341471}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.42378128466807036, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023597511181794963}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ed15f2805b8d517e924629c58c3ca2024e8c7780 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 15.51316933790889, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14131860296901558}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.578638440566317, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031479626596216882}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.47971897395973623, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027570588512589957}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.5033945465560786, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002201297123291935}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.29709476038507515, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027367014589648156}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2428822828451652, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002252237417006984}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2553196283460093, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002122343708414921}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4303055719591573, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028832325276234383}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3558059120429404, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024373374637128653}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3734887555835707, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002105068605546113}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.48802659063198645, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031316303520488075}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.4043415445372068, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002685631053632136}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4244461237028647, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023223935862545925}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..46d191511e4c7c5321d48167af2f004647214680 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 4.513948362755348, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05324817854650952}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.18083865590618006, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011967469680865424}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.39694748186124473, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0022358420017444636}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.2438649285193806, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001416529052761638}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.07655778766815691, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007880368223086152}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.1739929714611625, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017881791897208102}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.10422182142495252, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001031777045626439}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.15644810681042984, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001029610825920162}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.34537695956020553, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002028676271570114}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.2113950802978589, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012433002277184076}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.15748716535088403, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011172995166076403}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.3471846365481522, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0022023841121810756}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.2126259133049415, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013551634598868996}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a24eb9c04e5fdb2bd85e1d5e35f171cfd1e09037 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 12.021866804949632, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1701742092769703}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.5627005067366511, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033130096861858074}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.4277249029488458, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029565132907154986}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4599877335951275, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023317871941812207}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.2752724169297304, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027859200368061665}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2051992569808937, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021714381628735483}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 
0.22114544896184524, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00204967320559472}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.4175635896665314, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030646772446956514}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.31431498620706183, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024553483011121235}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3388960991118605, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002107043508551377}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.4639451945129169, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032545362146494264}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.3511223032556959, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027254287026514854}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3780995133274374, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023233337570530547}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of 
file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1a743ed324fe4dc0c4830f9e4e968fe2dea46de4 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 13.565233063876459, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1885705674727478}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.5837242686841533, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033033580242790486}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.44540220460576185, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002868658378007867}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.48052681154587185, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00228757243533114}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.29915385446115245, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002894369294200601}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.22399295380543538, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022420057498064956}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.24202094624513124, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021223172573097914}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", 
"rougeL_precision": 0.4399247896434536, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003079420403202306}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3333920863937247, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024659518407944923}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3602945422534893, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021337702090140783}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.4898081103762298, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032778129938177834}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.3727973962332129, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002724416991600159}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.40249470346290595, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023435648489988424}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b474f5e145933644b3fb1ff81d0c3829e1fcda00 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": 
"text", "bleu": 13.786595723877019, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16700801225608986}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.584661321143836, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032655523852631084}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.4488272835866765, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028798866593971774}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4829072150204214, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022431825256339155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.3002028921692319, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028661199259928725}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.22756205456066345, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023122295166738216}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.24452076890554655, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021341422736736015}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.44001019373241995, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030981399154380532}, {"task_name": "e2e_nlg_cleaned", 
"prompt_name": "text", "rougeL_recall": 0.33554664599715434, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024937602834115497}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3616068258766326, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002140335910026705}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.49080380963239784, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032481844726190176}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.3763660815881047, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002741120855013435}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.40495369605288095, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002311367860372464}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..62e50c4c2200d326cf473a40545daf61e0001eae --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 13.740418727123535, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.125168592466937}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", 
"rouge1_precision": 0.5845829081839676, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032404783946119488}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.4449834221000959, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002791619267995256}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.48272753824834136, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00224712531324722}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.30003225779147014, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028079090189879175}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.22533554720244608, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022354658494476848}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.24445326336589657, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021129956156590367}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.4403309294540874, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030238155742260196}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3341276065293173, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024525407514030487}, {"task_name": 
"e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3627627657925229, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002165015417177649}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.4923570162452141, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032397034691861526}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.3745534207008274, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002675901478552923}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.4064464779608034, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023451178623913123}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..464058702580825a4bb898447d65ed3dd135deb1 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_e2e_nlg_cleaned_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 13.656099683614805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17838730696724606}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.580946458312432, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003238803203078521}, {"task_name": "e2e_nlg_cleaned", 
"prompt_name": "text", "rouge1_recall": 0.44425456017925374, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00269888517548937}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4813817328517561, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022006369642601567}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.29685416578460727, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002799255589899561}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.22352618776361244, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021763786055710088}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.24244538824782974, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020844313178218616}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.43409102063959965, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029438891102049375}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3315067878833193, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023785090846630617}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.35914212757736946, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002092896469507669}, 
{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.4862732685071888, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031997511868591664}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.371675962232947, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026174808107856332}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.4027374392436005, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002301106330222583}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..426d8382ba1ac57698d925dfa2649074cb0810d2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.10974602642941278, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016265580165918348}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2751641641925161, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003764947977083456}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.1550455332379995, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002188766283324984}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.01715867033956244, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007491894700598756}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.044269433075030035, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019355166036210034}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.02442715447547206, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010547549811059692}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.08537711294726719, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011493677678798603}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.21573286471987912, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002778168194611946}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.12085117006455655, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015528468048302269}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.08875361641538126, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013008002608651363}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.22493786700312102, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", 
"prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003175265648492463}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.12575944814048232, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017727719225543287}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.826388517581962, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07659868838211477}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d94d7c58c7dcb153fa20ad743b1451397ae685b0 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.11235330499116508, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001825855414930503}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.17726301733586447, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003322946482029751}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.12859680838214518, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019817697277987206}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.0069436817076779156, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005112708010543145}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.012853271119216214, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010000150628169575}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.008505040157174723, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006200781636943587}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.0851208198632456, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013732771890519564}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.13231394019781906, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00231252669675569}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.09669731285510633, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014118648748043154}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.0895178827093416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014331191943847438}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.14231732687022683, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002682773387227301}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.10270197895040037, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015712774454742433}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.48219162434969726, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07477715903961155}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..63c392c805edf0ec712c21b75c515dbeec4cc261 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.1350415941925478, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002492146653666393}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.18324113752555313, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0035029398409540742}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.14430978218128562, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023207134123328387}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.01416051725563546, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009707528234085293}, {"task_name": "gem_xsum", "prompt_name": 
"DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.021669313236756357, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013707350830651868}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.015577366772524866, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009663775690245467}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.10374039906210167, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018277864057013675}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.14027716643303273, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025856202484314085}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11055239822249753, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016705958650524591}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.10595316244383789, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001864959233883817}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.14515880876640858, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002838795872969232}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.11349384281168248, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_fmeasure_stderr": 0.0017697409720081902}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.6747495040718029, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05447974028945998}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2ecf6ed7f3a43f087bdf3e1c6ff11d563fb4fa9b --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.15983388686624506, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003478411038525392}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.21363418868995598, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0044052794440733265}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.16669766406043368, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030859704820920294}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.026074826848387106, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014952152913898485}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.03800452360584031, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea 
that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019746449121423996}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.027764709643064052, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001423724925135889}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.1212242849057896, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0026148303202884774}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.16165123522451272, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033030332185164856}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.1259471852764778, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022633889318015}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.12424904095082498, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026319767934252754}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.16847726574481212, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003558450955464435}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.13002186162938917, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023502331314551616}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.3490765997572918, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", 
"prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07452514524336995}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..aa4a10c4fc5b946edec76f794f83f075d3da01ff --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.04907387724300002, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003234429189684823}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.05731059029513362, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036785998464218513}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.046439855607119385, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00284410839477631}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.009713544524476316, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010351152599463742}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.012358529122023391, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012249958901476917}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.00959324918731049, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009409652948725221}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.038493398405894705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002592388373637889}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.04381815286105373, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002801768957069223}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.03582073650035805, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021980490984669808}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.03939008585534109, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026368066863757503}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.045440716368802495, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002954025403461032}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.036871071543239965, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022698500974961127}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.24721291868418202, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.049588019713429736}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8cec07ea966c7363536b5317cf877c45f7fc9387 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.0017152658662092624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012123554660875486}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 5.723912790464725e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 4.0479309228269615e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.00011077758719268152, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 7.833875430317281e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": 
"DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.0017152658662092624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012123554660875486}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 5.723912790464725e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 4.0479309228269615e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.00011077758719268152, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 7.833875430317281e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.0017152658662092624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012123554660875486}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 5.723912790464725e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 4.0479309228269615e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.00011077758719268152, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 7.833875430317281e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 
1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6c744616c0aabae289b2189c3e0658d6384a817c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.14892453306484066, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019184491411793764}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.3528941190068972, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0043118852716188955}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.20671568072139454, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002528806889301289}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.03386350876730052, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010981805988616712}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.08363981215451943, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002723064071845637}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.04750004978424984, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015203677000594135}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.11077982199983927, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001386865922889612}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.2637562201881419, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 
0.0032521028737830106}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.15388992488912057, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018329285326789228}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11844827404779884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016239153417563323}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.28238722274619205, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037749607667179816}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.16468265983100272, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021669575359935553}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.875432863383546, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.062082888497796286}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a66e9ca708f923462479cc651c6089b84a1c6c78 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.19960964828555527, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004107575752866345}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.2157428016298934, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004055779615673104}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.19119281858963183, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0033101491874716737}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.038187194305397985, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0022349877900683045}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.04011310335796683, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0020275784302354574}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.03568595727216617, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018632965400610909}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.15277349949813956, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032872529807419252}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.16396576376162655, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031054755283307263}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.14564552170447934, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025798467681516775}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.1550209051635969, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003284288318473702}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.1688330220331253, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032920982306984}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.1485833712808272, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026125773661705726}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.8941294842971168, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13215157825403162}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..19c4941104e2f73bc7fc5ec825d1c90d495fdf40 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.26689392594772454, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004337626289582131}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.2475592856825416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003872465100596751}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.2428878538323252, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003472823044516277}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.06403837156878552, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026911466509326534}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.05835175860926151, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", 
"prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023224836030261964}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.05762339608594923, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002281322761921692}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.20230244190777438, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0036014595491025043}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.1862105332580683, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030118849618256224}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.18314939874317032, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028003189953992335}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.2037770460842929, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003596337844803637}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.18900362833831452, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031234507389558894}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.18499281034459034, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028200577177087296}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 3.3498604987705147, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19596177585353966}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c193b0403c76d4e7b2a73217240f4f6067e20427 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.26286944517436134, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004547016993765234}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.23922210627832416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00399242210516383}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.23937434803922442, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038022538947401112}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0676498945961149, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002738019053264675}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.06043520173252685, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023483245889642074}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.06083246734082769, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023310201660808163}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.19937399365009664, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 
0.003777710368463551}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.18013165514185406, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003193049282501879}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.18064028651608502, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003098457083973783}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.20046547390236114, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0037863004323181425}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.18165174901569578, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032501434351904436}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.18183242521164045, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0031193531188508203}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 3.299640404449182, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.22526369317807593}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f5ba6223109a1667353e0b3fddab6ce21bffc16c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.07165582917605977, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004574396501698046}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.055218754309569865, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00340554578019057}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.05800954094661802, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003496221478335308}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0170476050564283, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016378678918016383}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.013811166550490082, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001320549075325777}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.014481860106880176, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013623544383952462}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.056339128495289034, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003806969072311165}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.042227353071765686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002652946223493758}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.04451543084785548, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002727104103466008}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.05702888046338904, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003844631247600613}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.0429696011839616, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027150651579781033}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.04521446022524568, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027776518676359114}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 0.06983354124732599, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02299895135254461}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..584223f91c58c9e536d855f693240731a15a45cb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_DOC_tldr_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.0018831909793706626, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0005784722940521672}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.0013454709318106806, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0004064972952939784}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.0015297630264254271, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00046352326818864206}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.00010890576928312776, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 7.757501140277646e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 7.004002287021154e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 4.9514840646779456e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 8.502847929189122e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 6.021299119569327e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.001468303303617866, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00046494145314115326}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.0010215023772211784, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00031150675030788545}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.0011680695487629835, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0003584965566749215}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.0015790808908105478, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0004907819897375802}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.0011142872267091062, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0003381488629476795}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.0012690548984618838, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00038605695478641824}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 6.30933023510575e-46, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.4028369955982456e-38}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..352825c635ddc0cd5ab21f967bc21fd5d32d3081 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1357849290108061, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022085026440544792}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.31125110203110956, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00478150841719741}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.18544940835038826, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002792598389230579}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.02806264964322551, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010490970791455262}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0676376603721838, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002556389169521471}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.03905778248153537, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge2_fmeasure_stderr": 0.0014517464622913717}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.09941478269511335, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016710762156098235}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.22831881338494908, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035475652964304587}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1355227325800368, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019961537320126754}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1090062194949961, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001862164401400258}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2506136306420929, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004002812528613584}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1487940648277913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022923438101904283}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.6078680919808515, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13305526786747834}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_1.json 
b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..666f60c20bdd5fbd93eef3f939f381459cecd3f7 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.18487966767987538, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0036696902142416914}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.26149928474217465, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004228571030201456}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19812664092391155, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003093672497471887}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.038450464485312766, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018721013867502832}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.055093479727191086, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024314883171235226}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04107507874444822, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001790944203843995}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1407652383762195, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002867258911637599}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.20074896873173023, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033400948802297637}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15109086002249855, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0024037820360525127}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.14290226133870376, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002899542963069592}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.20559699548522795, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035828941817263715}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.15400915124743492, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0025002059384586265}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.7426173208536198, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06390989391271595}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..95393c2a35bc9b4602324e58ae1b5601af94f73f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.20654064116685483, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00402670019732593}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 
0.2761636603625548, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004261678708829253}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.21646360740135792, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003295332693289183}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.04926519014044061, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002298517760897842}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06300838426900698, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025900885284329783}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05024654283296407, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020978860757402073}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.15783804831230816, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033092482048118115}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.21095994736677962, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003416543139413067}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.16496562440527163, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0026851454383202047}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.16058710601356815, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: 
{{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033312368166406067}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.21694184867744362, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003641660551599739}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16854842502615797, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027533826606543164}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.212042173147054, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14754927518486374}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f407e833a51080fbe4ed8261e9112b0a50213cb9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.20968934404830927, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004355318865286731}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.2714957979387898, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004428046228223116}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.21602695443039088, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003545745499002209}, {"task_name": "gem_xsum", "prompt_name": 
"article_DOC_summary", "rouge2_precision": 0.051495808432128344, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002483343433806453}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0626272643756316, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025430698963779036}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05092039724374764, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021028017555591145}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.16001543314957897, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003518230552839032}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.20782119995033888, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035214877778981}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.16465282938122464, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002817348856079681}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.16265225378046935, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003535398294040215}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.21338945027387293, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037313572424076884}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16805945710868087, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028771833437224903}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.292440773010242, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12688961626559822}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3aacbbeb40f47ffa468f826b9e520167a05041c5 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.066317893890864, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00423472373517398}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0668307970074077, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0038903454010429882}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05844521610204345, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003357755822076642}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.01516582795790956, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016864721571447335}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.015164369380224488, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 
0.0014785207392795535}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.013427211505942963, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013325070824321361}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.050958054485125935, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034077121993672244}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.05078596392429664, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030413833239327495}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04435132297085604, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0026237858818432148}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.05197463436429066, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003434452220019136}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.052735906009332555, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003177823933607375}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04569745564992877, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00269219477160947}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.34928748143158433, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06968377794297799}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0056415d0fc787404ada208d36057bda82856341 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002581036492851647, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007454396977849946}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0019439835173572402, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005504845967048701}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0021633187762348204, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006160758089028403}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.00030731846769582613, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0001384434719538918}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00020070076673850258, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 9.047229228571321e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.00024003297882235707, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00010755582565714237}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0020398313639409388, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": 
"", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000580520638938891}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0015747914931411474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004430161114171138}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0017320310946117893, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004885245020078372}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.002194331414389935, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006175664744399804}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0017109237047450568, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0004811354022989026}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.001876404823891179, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005270763478001978}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.845610532495598e-43, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.163279325803633e-37}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4f4fe31a0924817f8286547e4d6294e30604ba92 --- /dev/null +++ 
b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.1392916956409755, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018481196716208217}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.33138070505525874, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004276391177461391}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.19341295101992514, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024605445250390295}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.027263388973015015, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000983425389980829}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06778244092557704, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002505239823925805}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.038300051518121075, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001369595902243146}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.10118400957605353, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001314306255628095}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.24197014523792498, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003179571094954232}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.14060204154591033, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017583084261119269}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11110341692578951, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015204648459418455}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.26610036052943387, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003666908475046308}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.15451525716329884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020462224153125503}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.5021646844749303, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06803860105093382}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6ea2c94caeace851bc0da13b9ca60cc860a3cd1a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.17917229015158215, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034616270337872687}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.28234191998252756, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004328228914698034}, 
{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.20089012009350235, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002978496072546475}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.03775920828695508, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017965171904386755}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.061221540970532014, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024602671401857466}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.042449076402835885, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017537376058430483}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.13568899758643302, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0026852949458789316}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.21508610392050376, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003368556633613049}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.15212125887603897, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002276633466962906}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.13860339464985375, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0027145150651571065}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2221076063699141, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", 
"prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036838975194121463}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.15618747415464568, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023944466167414584}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.7438099023912716, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1601928555347937}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..21ffa8715754d8623a26eaee2dbdc8468b7f7237 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.21314917496652097, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0040600258534636575}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.28017732093506925, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004156332947155909}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.220351016432514, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003188805586738867}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.05029543701564033, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002351457071629338}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 
0.06347492383046742, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002463338288106309}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.05038721991867873, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00201183920236601}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.16282719781485017, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033585840074919944}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2130973674647334, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003244727756549482}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.16750321297579993, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025679565650771298}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.16587114072654197, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033789416242402844}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.21968584807863187, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003491606534581178}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.17145097615519603, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026315536730007077}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 2.0985070297675317, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "bleu_stderr": 0.13154446649102086}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7e488bd24df8fc3820d5aa80d2f2b1206956563a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.22280760154852117, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004390904652685638}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.2660670648202862, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0043755645907187345}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.22126778715622059, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00351049034903905}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.055569530372989805, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002485660853955144}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06528945138217346, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0026099692030146695}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.05451394163011369, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002197533206236956}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.17071198449258404, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003584396198060428}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.20296992347781656, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034513932905057944}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.16849308202989022, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002789201858785969}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.17303337856565984, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003592404654601901}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.20861557263054867, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003716319779676917}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.17168891280364598, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002856280916182101}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 2.696335421679655, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15429316902371332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..525301ab8104206abc1da32a316387bd92a494c3 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_4.json @@ -0,0 +1 @@ +{"results": 
[{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.06459589021014682, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003891095380846983}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.0648529858041251, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0038243852386048685}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.058187417979200434, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003298768398525437}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.01418629371385429, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001531715516561685}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.014177449560852498, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013311123981377007}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.012582080149982773, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011999611107423268}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.04942622014780535, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003112611733184756}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.04830459159444718, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028657851487914514}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.04360619288210294, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": 
"Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002517973327420018}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.05058606223316683, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031519936739739054}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.050243110562926054, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003018354788190834}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.04491124074474275, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002579713572678555}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 0.24972558565783795, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.053684508836508174}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3567f21fb38ce4bd90ec7c862bc90ae4580d0a8e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_DOC_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.0025190479973907745, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007080212578711208}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.0019522897365303632, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005315917820907907}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 
0.002161934536709088, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005948885851124559}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.00038639243032022986, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0001470964395764042}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0002587271219346691, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 9.980221995641359e-05}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.000307333743214037, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00011743634313830978}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.0018826374863526564, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005147273086202652}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.0015177893314996913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00041368575160514705}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.001652183701042147, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004498633984143077}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.0019969885440999403, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005512037388198252}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.0015731204884741835, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0004244780256693147}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.001726760477833854, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0004678249512576879}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 7.469531234917508e-40, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.8389499820293147e-34}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a504042a298e512b413efb71cc030747a26ef08c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.1455268959483828, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018622749607537018}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.3423730472851513, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004243968510683615}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.20150117996288208, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002475420919807577}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.031113013875646793, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 
0.0010564472781666096}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.07657376393599181, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025908898115614702}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04364050860274179, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014638169607378785}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.1069927606611518, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013238092147911304}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.2531894635221878, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031552855238693913}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.14830049281611307, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001767565901905731}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.11608704788644836, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015422029588228266}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2745676793472743, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003630899569864946}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.16095008318175014, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002069319454417293}, {"task_name": 
"gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.69159177851078, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08597377916560887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4eb40019e91f096e64b0f9c9f46f935dfe25e73e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.1656766838106151, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003311707578975503}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.26380436066405916, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0042896104700150565}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.18687918757076946, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029547400275911884}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.03192700954228503, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016638524448754533}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.05133067604137321, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023201541616321535}, {"task_name": "gem_xsum", "prompt_name": 
"summarize_this_DOC_summary", "rouge2_fmeasure": 0.03611720929073555, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016831709482522717}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.12671804880247828, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0025399475086942853}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.20219533841657047, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003293526711844687}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.1427925316976916, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022483842833851977}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.1300589316850642, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0025785593599623085}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.20963166196523544, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035927461486533713}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.14726996806804912, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023615534405494057}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.5154327790854452, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09075795006322013}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0dfa14955fc328700dac2eeec916e309e1f54db8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.18909484566599685, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003950819844394485}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.2718769604920019, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004223865744923052}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.20341496217018784, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032458847070410616}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.043554704955756154, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002176621642380098}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.05972277200498379, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025667116192052124}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.045462384799217964, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019968008810941603}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", 
"rougeL_precision": 0.14613433508915188, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003224590842312493}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.20951638761821206, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033167476690494297}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.15660542083343884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002596200719484403}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.14840970177848015, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003239839270586971}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2149189211868496, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003570016109369942}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.15972610503232226, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026706170411476843}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.8561837473144438, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1652517173077056}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6bdd1997466c09c215f70f72613f4f2ebe8c7d0a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.18257690791972123, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004006918517667934}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.26033673075849767, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004352519745393548}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.19598928489102377, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0033814773326124085}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.040870316407448856, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0021374579637365066}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.056270108437116266, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025158970204257696}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04279947767402745, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019564637943375314}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.1409116045729758, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031679784442123723}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", 
"rougeL_recall": 0.20275659130062612, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034509880571316465}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.15173273328212009, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002671616136858398}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.14292997261659537, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031880065498972026}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.20768819850292705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003681311843517383}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1545405886375339, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027449821535318057}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 2.0809769345599323, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19432487424847153}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..29eac08fdbbacd65d1fd2925c25f0c2430fec894 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 
0.05108032804071894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003390049476133475}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.05880001625627721, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003736477018998282}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.04890767455000632, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003057134704734198}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.011543780490512906, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013067634333148541}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.013097981096374263, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001376718188766686}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.011057807884388515, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011751788248271782}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.04029696513661245, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027502948998389862}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.045549874016116855, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002924162344624697}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.038159094630815064, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00243434744749608}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.04104440973285439, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0027735874936627284}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.04713611215249886, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030466751631097844}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.03915726021159841, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024864361723350315}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.2880719761678462, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05189985195943128}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..167f65055a0bfffc6bab74ceea42805db32e13d8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.004002287021154945, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00150882185401594}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.000505816634884062, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00018048741472614392}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.0008888379700377073, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00031704658560943185}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.0008576329331046312, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008576329331046347}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 3.430531732418525e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 3.430531732418596e-05}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 6.597176408497164e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 6.597176408496901e-05}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.003716409376786735, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014258502483873398}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.0004553676388190836, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00015803630071122688}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.0008030746767272441, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0002802822184911789}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.003716409376786735, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014258502483873398}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.0004553676388190836, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00015803630071122688}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.0008030746767272441, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0002802822184911789}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..00dad93da598a8d753906745446d47d038060758 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 6.379963927829925, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.2573526772203268}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.08430156983395772, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0023853160409478395}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 
0.740846474029314, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006167016289652982}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.13918517262851055, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.003031126931687362}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.06966123539471465, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0023674498306732014}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5868627439659466, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.00770185112252964}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.1134449009766221, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.003040297061765627}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.08383375028656737, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0023771360831379928}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7379733730054571, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} 
\nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006199103560931277}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.13842755446553195, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.003019229498486827}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.0822443257804446, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0023831466991385618}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7222343828686507, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006418079058535647}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.13560704839500504, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.003029190011581066}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6c9d6afe3587b72669a9cc52ac5e8949b6a742ff --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 57.42465098900179, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, 
rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.1078815648831635}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.6808088885246102, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006796979519538104}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6336659473598535, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.007414216432858779}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6357497501654351, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.007151203933587638}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5255864223795965, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.008044938147511789}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5006490922843432, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008204408297570282}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5007567674149286, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00806429537616059}, {"task_name": "piqa", "prompt_name": "Correct the 
solution", "rougeL_precision": 0.6610225187736701, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006991919706073141}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6202751689797952, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007617793392838218}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6211515568694629, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007369923868292126}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.6647318479780407, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0069741652380140694}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6222421250530864, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007583094680125747}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6234521192835725, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.007332794233638315}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
"task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f1b870f9e4ce2494b9d4d391b371799bddaf522e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 60.44635224572798, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.3446237122314244}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7036876644842045, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006496630984767584}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6582369637194351, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.007108402892336969}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6632876537876221, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006816688459749599}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5507183829339078, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00788053127659401}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5241620180691751, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong 
solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008047211431302723}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5268362804678172, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007890623827590655}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.685976181176324, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006688246337688496}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6455843829048997, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0073101108152440475}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6497429564395274, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007038662454206784}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.6895552326789159, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.00666828093642199}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6474552209600517, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007273618289358875}, {"task_name": 
"piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6518983844565237, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0069968600613674855}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bd166581037a16f1ee1debb2cc531ff63554a30c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 63.99859340473141, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.0181011163291809}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.718671836461919, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006294527586528868}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.685273079936742, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006773596698921195}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6872300555455244, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 
0.006552132289138089}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5718331533007336, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0077333134242707}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5520767659854839, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007877835743632268}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5527349338268147, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007756677211709758}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7031204259857672, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0064960038038979605}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6734324094649778, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006990313714994473}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.674737159574106, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006782676101560994}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7060532814193256, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", 
"prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006469915199530839}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6752154921932333, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0069521353853381936}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6766918854024231, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006741989553841211}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b6c9d74c72975f9de91bee0b61ad0fb8f69fef1f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 65.68343284307088, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.4512625096573333}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7245064757365981, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006186217724643701}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6966042289172039, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006598392835454942}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6972916814165189, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006402402634467684}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5796069278121242, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007670281267649787}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5620584766606035, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007812313870631962}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5623294588317255, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00768450794995948}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7098784243871913, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006413128988144364}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6853303584523095, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": 
"", "rougeL_recall_stderr": 0.006832123614751252}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6854739713954158, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006648892928897857}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7125589718597797, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006374119063501137}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6870178382316655, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0067907383524628976}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6872870920135151, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.00660382042497619}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3fe3e12cb896fff02ab90a4cee492900db0768cd --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_Correct-the-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 67.80635157272806, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, 
sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.150124553790288}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7319220129062811, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006041436418049977}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7105094894163213, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0064230522240220625}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.7098871630878016, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006221650632659693}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5897715500763321, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007568603611022888}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5762294144509804, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007700758909051833}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5754350955997378, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0075768108295479145}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7186482704494902, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, 
"subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006254631595816518}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6997515950999912, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006643987175394352}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.69879364437856, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006453508082021247}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7211579539003966, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006218818928365821}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7015037185292219, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006602458898563086}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.7006368042688799, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006409941634775962}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, 
"seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..07e8282569d5ef1aae1d8dafc339fdbf2b901e89 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a62fa6de74f80cb351a97a674edfb772ec478519 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.499455930359086, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665817258899178}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.499455930359086, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665817258899178}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3c0aa97cfe6fa38c3ec87cd88c5ffde2988f033c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5114254624591947, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011662778026451666}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5114254624591947, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011662778026451666}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..be06c7ecef60723ff1f46e7a03445e3f2f52a6ad --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose 
the most appropriate solution", "acc": 0.5195865070729053, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011656869979288453}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5195865070729053, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011656869979288453}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f5207528f6e12a1006d5f2ced81b5ac1d526292d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5087051142546246, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664055982032837}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5087051142546246, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664055982032837}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..13be3851fa26222a8c9b22a2f93dc3344f41082c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665713661738877}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665713661738877}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cb8901d613af73e40e0185b83d35f5a93e7ba366 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.1505467019035778, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.014682785810059218}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.020272307866543455, "fixed_answer_choice_list": null, 
"dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0005265772258608504}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.2191478275652487, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004134750154842083}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.035170959301194286, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008338435093530204}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0030507521899887356, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00015211877263159542}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.0381262159440338, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.002020826886323144}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005364179202157353, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00025695320904709255}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.018486089076030096, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00045586856654085276}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.20445528794712298, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0038441092026702847}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.03215186695687886, "fixed_answer_choice_list": null, 
"dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.000725133860372767}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.01668000026963013, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.00042749970187600867}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.18840601223965645, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0037496927990254393}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.02901357418513821, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006789454181899272}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d5af8f2ba0e8324e490971ac4ba5ac5f7c7e36e7 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.2943694159026868, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.019001956315490544}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.06889633960575121, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0029528371645033075}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.1749622652355825, "fixed_answer_choice_list": null, "dataset_path": "piqa", 
"dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.00418913262605897}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.07114356718637975, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0024530989412474146}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.013747092252940312, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0010815279565432835}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.034659257546783305, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0018737240504667597}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.014533470414966138, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.001004705308911276}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.05678669759372553, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0023995612619735897}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.15634194763841833, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003842783856016258}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.06008732553084146, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.002063583089286293}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.05799081392533638, "fixed_answer_choice_list": null, "dataset_path": "piqa", 
"dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.002547867930613753}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.1492710512090213, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003695060862834212}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.059790388949648735, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002132773374026426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dc754ce45625f6e091bca53e9dd3f24ab96d3476 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.7429861146921837, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.041630683755388004}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.0782745309191772, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.003661536035808258}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.08833366558571519, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0035763050033546058}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.06384482992877637, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.00270802656663024}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.018971204567379225, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.001716087418066384}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.019513437693357994, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0016266277324259893}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.015161629432520633, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.001361177448233304}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.06733854475743022, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0032077160777706985}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.07767663804319452, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0032078055204207683}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.055271222595081285, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.002403409792808055}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.06993779483360749, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0033484481247688168}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.07770123233082768, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": 
null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0032126513659949513}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.05676099920154195, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024748610608637403}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..898af34475ee359a2702d28e383e9cbffa20e2f4 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.5886974488567144, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.0588610972424531}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.07828648309935715, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0037910985558715117}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.0748434759683687, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003447795552872216}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.06127218818801527, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0027834226920066146}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.018883425449363365, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0016912212541414398}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.018651177545109577, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0017901304066660615}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.015325075589192747, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0014230205586656989}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.06753098472798698, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0033223511190554516}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.06602107534803005, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0031350941534849562}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.05326156438733179, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0024939890097848264}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.07000376450757803, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.003449013306952803}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.06699980444105336, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003167802494403317}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.054652079076486976, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002541866737238746}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bb344483a2cecce641175bfc9effa7570e245233 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.5677418183572901, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.030389830098839228}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.08869524418004159, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0039919536068413145}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.08221046365791854, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0035462487851992963}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.070280450522492, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0030013676962094026}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.02184402587413536, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0018288818965170844}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.01942179813273096, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", 
"prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0017457514434629806}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.017234383744728854, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0015043166573897747}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.07691570690053545, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0035155841708412053}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.07231895867275351, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0032076680327550494}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.061159781312434845, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.002675699108061221}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.07972711816697133, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0036660570172046473}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.07381189517508352, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0032437272615878235}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.06284895391649062, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027355571624958492}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..533acbd665d89257cc41aaa4299bae05f1501a84 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_no-prompt-needed_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.6340158799904358, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.07074410475536543}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.0980231979267413, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.004088794455570127}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.08967139314514878, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0036211300336253882}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.07798760826694028, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0030806394748408095}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.023143447865316625, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0019008288816567988}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.02009787194324948, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0017663286220115724}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.017977968875879748, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0015069906896766742}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.08522653187716864, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.003609502601936802}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.07920267912848071, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003267765504312832}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.06817709804550225, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.002749241681504631}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.08765190358636991, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0037121088015769177}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.07992387309151867, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0032810895663960803}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.0695072665160449, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027876585365403865}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..310da9c8e10c096edc2fec2c36f60ebdbba876e2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4956474428726877, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166538214464238}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4956474428726877, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166538214464238}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..415f099f206c9c1beac10c6ea8ed608a3e18caa3 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5065288356909684, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664829595210969}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5065288356909684, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 
0.011664829595210969}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..697f9d1545c185a1dcdf0b4366cdc08bcd9f981e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5168661588683352, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011659185184878913}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5168661588683352, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011659185184878913}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3383e75dc30af2b194ec85c41cd09172779a9dc2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.514689880304679, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011660788281735494}, {"task_name": "piqa", "prompt_name": 
"pick_correct_choice_index", "acc_norm": 0.514689880304679, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011660788281735494}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..60db0922d73a288febbd9bf616815bb6ca790da9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.500544069640914, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665817258899168}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.500544069640914, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665817258899168}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e634ba843830b3ae815d5ab724ed42d8865cf1fd --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_pick_correct_choice_index_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 
0.4896626768226333, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663330673075898}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4896626768226333, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663330673075898}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ef5557f3380aa34eee704c846de7fd1c448aa7dd --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5718171926006529, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011544859155318846}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5723612622415669, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011543009623282832}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_1.json new 
file mode 100644 index 0000000000000000000000000000000000000000..2ee5f7e571f3ebc42b085edac422f9e085295922 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5625680087051143, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011574126069682387}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5609357997823722, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011578865649321295}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4fbf1ede466492dc524d8949887c48d78c1abd83 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.55930359085963, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01158347809065713}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5565832426550599, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01159088337366686}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_3.json 
b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..39f9737873ea773d933573e249460b36289f106e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5620239390642002, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011575720065594108}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5603917301414582, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011580417248656574}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..05e56d6403db2207d3806cc47b5ed9c538ef4bb2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5609357997823722, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011578865649321297}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5544069640914037, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011596554080987647}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d1b5dbf475bfe30bc6cf2f6a197d967234cd9fad --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5565832426550599, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01159088337366686}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5522306855277476, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01160199979686681}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c83c4b4ba3b3e8d2844b90f0111302ae9276e799 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.578, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01562562511262066}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.499, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01581926829057682}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a75d84986d46664c4bee450a495cc721e511c4da --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.645, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015139491543780529}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.62, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015356947477797582}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0096f49b51d5b81b36d5c5019b6b91bd9c97447c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.66, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014987482264363937}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.639, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015195720118175118}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..1864a6f78620843f77948367249e151a6886007e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.666, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732961}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.651, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015080663991563098}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2a013b9df84eb71f056d6e9ed60b18b3f8ba90e9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.677, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014794927843348635}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.661, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01497675877162034}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..006477fb3c225d5f834375c82fc69230e3940f7e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1 @@ 
+{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.682, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0147340793093119}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.672, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014853842487270336}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9c3d09a64ffe1f50646359a93b6632e27c9177b8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.849, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011328165223341681}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.778, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013148721948877364}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..99252517356222aa83d727f5de3c4475fa5f4f53 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.894, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, 
"subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009739551265785129}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.876, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.010427498872343968}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1ebb8954d125fde379f10ba19d17f572d50bfc25 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.898, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009575368801653905}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.899, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009533618929341006}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6ce333280283b2009c85708f00f8915573b98033 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.906, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": 
"d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00923305200078773}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.911, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009008893392651547}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..077ebe579d2891efbcfa71002f856707a2357a4a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.913, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008916866630745904}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.916, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008776162089491104}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c7143e37df70e6a8e29da248417c932a24d6db63 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Direct-Question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.913, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", 
"prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008916866630745902}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.924, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008384169266796386}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..254dc4a62b1baf46ea4d633ba118f59ca2ed330f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.353, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015120172605483694}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.37, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015275252316519362}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dd0ebc91aa14dfaa37364f524cb0e1ca5945a838 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.369, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015266698139154615}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.371, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015283736211823187}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..af4ca8d8889041d085da2da320bc66f21ebdcb33 --- /dev/null +++ 
b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.387, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015410011955493935}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.388, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015417317979911076}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..87b17836b7ba9580aec45a365ef5cbdc50ce4861 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.401, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ 
answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015506109745498322}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.406, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.0155372264386346}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..acd960a3886110bd8f93cd5bef5576582ae7d465 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.39, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015431725053866611}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.399, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 
1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015493193313162906}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..00fe5783d5267814b19185e228cd97708381365a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.389, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01542455564730849}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.386, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015402637476784364}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e399a6526e6f12e3a037c86165fe2c176a10f1fd --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.431, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015667944488173508}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.413, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015577986829936533}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..78272f5e9d6f11173fe6ae8470b7e26d7f0db575 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.447, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": 
"368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01573017604600906}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.448, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015733516566347836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0678482a013bb001d7bf23e83b716c0ff306692a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.481, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] 
}}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01580787426850585}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.479, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015805341148131296}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d9fa61e42236b1e4285536b1472e940a4af70ef5 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.511, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015815471195292682}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.512, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], 
[1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015814743314581818}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1bd31939b0fa2d856d6370ef75be8cdf061f921f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.494, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015818160898606715}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.507, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015817749561843567}], "config": {"model": 
"hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4c84522caf75797047d6e6e996426a70de42943a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.499, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01581926829057682}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.501, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015819268290576817}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_0.json new file 
mode 100644 index 0000000000000000000000000000000000000000..fd5c1f948c3ff03ea454868e67d967b4fea0c4d9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.417, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015599819048769616}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.424, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015635487471405182}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..edea4e9eddcc3ec89d99ed5837c989641e806bf4 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.438, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 
1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01569721001969469}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.453, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015749255189977596}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d4ed6da22015ef308d6f9263198785b42cdc4694 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.438, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01569721001969469}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.439, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 
2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015701131345400774}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6d1a2507b14d5d3c6e6a4dbf39310442593c66dd --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.442, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01571250721186421}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.437, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015693223928730377}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4f2cb9ef6aa20124ac7b8f1bcd00330e9f026afb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.436, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01568917302314407}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.444, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01571976816340209}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1da0cee9c85524f9b2ca1205a6fc1883f0630aa6 --- /dev/null +++ 
b/4b284b17boscar/eval/agg.4b284b17boscar_sciq_Multiple-Choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.422, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015625625112620653}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.432, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0156723202373362}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..887e91cc6b423b764013f537fa3e283058c37207 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4751469802244789, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01154813982307477}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.5045430251202565, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561954965856519}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8a9bbef95fc25f0b176d601fa1b1c05bc76f07fc --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.5056119722073757, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011561703928784327}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.5114911811865313, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559378273599128}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa4e166ecd469e9f049c013781a839852dc6f67 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4853019775521112, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011557435464292914}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4922501336183859, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01156104327886354}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c18c372f215b61c8240952a2c66ea124470f9bfb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.47140566541956175, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011543509045585206}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.47835382148583644, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011551591851683337}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c583503df55b188afa65d9519b474cf53205eac8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4778193479422769, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011551049647290314}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4735435595938001, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011546234813777395}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e3a8b12d59274d6cf4f8ad1719a8f1633b3000f9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.46980224478888294, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01154132532033662}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4730090860502405, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011545573278697237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fc49ed83c37876ffaeefb1eb81df7d7d5257f0cb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.5034740780331374, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0115621531491683}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5200427578834848, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011553138977961008}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7dbfa8c5eca16b36e3a8312cb96b3ba1ceb08c8f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4794227685729556, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", 
"prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011552636515221856}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5077498663816141, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01156104327886355}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3bff425caec44b66cce6fe428ff0021bd193cd98 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.47621592731159806, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011549343521088358}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4719401389631213, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011544210396951672}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaf0dd99ee6491873d9b9fb56078e7d5b22648e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.47033671833244256, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011542066509767008}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4607161945483699, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011526690316014587}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..60b716aed0e41717bb60e2440dcdc764ee3f89eb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4623196151790486, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- 
{{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011529552555884573}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4564404061998931, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011518470676766509}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6bba862d7a817930fe1459a2e95f88aba8521cf4 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4623196151790486, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011529552555884575}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.46178514163548906, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011528611805439891}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2042a9bd701be41f6ea8b22a5a780d80f3b90842 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d16661a2e02be1afd40f17924b50784c2b423deb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c46859e978d58bd353b0fcac89aff586a707d741 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f41d53370249d335974631b0d6f1914adf427f98 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No 
newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d0ac7912856315a7df74f3ddaa7338f8ea457949 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..da9996c9622f73acb4f6731db74d84b23a840583 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c3ebd5f372b91a65799cb4168b652722d4f8c6f4 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4826296098343132, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011555452669106635}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.5024051309460181, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562298481438055}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7fff7bd90b1d67e7d929f7028e7f6194b3973642 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4938535542490647, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01156155858904076}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.5141635489043292, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011557792331301674}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..245963039145ef851b3cda8bc230f0a9b6ab1aa9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4853019775521112, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011557435464292918}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4922501336183859, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561043278863542}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..aa333a43ce4fc7a2f191500815760dfdcf88ec57 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.47140566541956175, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011543509045585204}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.47835382148583644, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011551591851683335}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..45eb9d303811c965db5a0926a3699e20e3707cd1 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4751469802244789, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011548139823074772}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4730090860502405, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011545573278697237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a9eac6139d9b7ad6ee3b70ae83aa050928c3cbcb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4735435595938001, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011546234813777399}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.46980224478888294, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01154132532033662}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a5ca81e1551d5a0697aac4ebe8d07958d9c7eba3 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4965259219668626, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01156215314916829}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5146980224478889, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011557435464292923}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..570e9e40e8914237ab3acf91e8b056c0057f9268 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.47140566541956175, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011543509045585208}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4997327632282202, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562430600098487}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..01abbbf4ffebdf7e796c692e2fadff84ffff292c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4767504008551577, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011549925483927454}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.47033671833244256, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011542066509767012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..631def040c1cc39b890a68b870420101589dd749 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4708711918760021, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011542794417345719}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.46285408872260825, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011530479981182624}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..42f2e6aa81e85113b89bca3756a8b450be865b18 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.46392303580972744, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01153229486915312}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.45430251202565475, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0115140402455835}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3cecc0330003e40e743aafffd2f06af78b943f6e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.47835382148583644, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01155159185168334}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.46018172100481025, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011525709570367504}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7a35b7fcd85200675560525f0dd1235b6f1820b5 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03003973059219781}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fc774fa813a33e4981cccf47dc7c047e386a1cb5 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.51985559566787, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317184}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03006330041190266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d339e3af1a4cc29bf1ba1bbbea370d846d0a46e6 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03003973059219781}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5415162454873647, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029992535385373314}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e08939b1970f11ec869f03c2c16ca649987e8246 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5379061371841155, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030009848912529113}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6e298473d8b37ae5b7ad7f00c18d576d18983db8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5451263537906137, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029973636495415252}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..30bef39b98094b00b50c782751d4b68190d313a1 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.555956678700361, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029907396333795997}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366422}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ce1c1f65a84b4219282fdc385829c6d43aa9e4ff --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..72b669077b878891d849c0d9c6d0997f9f65e22c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..084a76478ec26f658c170c51d26d675840aa907a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..17acefdafe27ed6a02688f8c9bf0c0d064dbf682 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9375b17034d264ca4cc3b478d2633ab6ea8507c8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030009848912529113}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5631768953068592, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.02985524739031494}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ef39251465ba9ecc19a802c125818e90d8245fa1 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030039730592197812}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03006330041190266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ac91d34fac485e7ac8ade770aec579f8c4dd519b --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..15cb35696cada5dff9b09e69a7be8407fc95b22e --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.48014440433212996, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030072723167317194}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..330bf6fc4cb7800565d7d621eb5969a7912af7ec --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..11888030ab1f38ba255edda5236529e97d534eac --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6c0c5b17f33539ab316f1f82a52d13e7fc523402 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1168e2619d8eafcdcc5fae0fb6f8297af90089e6 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.4657039711191336, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030025579819366426}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.48375451263537905, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7493aef8cf359de7c5929dd740fad366ae46bb68 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030039730592197812}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7a5470d1823e0661518b540a411af83f47c33249 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ead804267210295479fb57f54d86c8694a338020 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..45b8f915af38503a8c7832e5a724ead353c641a9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03006330041190266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..578afb6ae03be0035796cc0e29337d73646eadfe --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5487364620938628, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029953149241808943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9600bb8ef9ff733d4e981bce9bd922d4adcbe478 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_guaranteed-true_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5415162454873647, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029992535385373314}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030009848912529113}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..919c8a940c8b712e3f36c71cad90f5d77a781160 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..156b6669ae76478a1d10b70bf6cb0dba6a7419c9 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3a870fb5f00c600ee9fd5f6cb187f8f665640847 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030063300411902652}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..11dca7e344b160c06290b97f3b39f74fb72af42a --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f1ca33b10426d382c5aa24fa4caec2e1fcae1508 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..842a418c5a5e1b7531d66e969f985f26bcd624dc --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_superglue_rte_should-assume_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030039730592197812}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030009848912529113}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7dde5db3a6a3b7f048e8873fa23935d4e756f95d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050555322824192}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.4925019731649566, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405090552122858}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1348e2360fc56fc5e0505bb4bdf46cbfb654da1d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5280189423835833, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014030404213405784}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5098658247829518, 
"fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049749833367585}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..826e290628eccc691eb50bcbdc02dc3fcb3ef399 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014049294536290396}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.4980268350434096, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052376259225632}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..be56807ba23542887b1ebe4a20627ad7de5cd484 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5209155485398579, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with 
the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014040185494212949}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.500394632991318, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d2bf1fc8056a387d0dc2d083e378ceed8e6fd000 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5114443567482242, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01404880419985932}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616441}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..18809bc8f300acf27e10e68483d2c0025e65a163 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_Replace_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"winogrande", "prompt_name": "Replace", "acc": 0.5138121546961326, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014047122916440419}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050555322824194}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1045f5bf035c42206b729f91ded09f1048227250 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4964483030781373, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01405213114691586}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.0140519560640769}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1da7ca6ae3b931a9e284acc42154d8f60c29397d --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.489344909234412, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014049294536290403}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.48382004735595896, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014045126130978596}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..87ecdf6fa816e33bb5fa2c5e6de720d5d25ea041 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5035516969218626, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052131146915845}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5011838989739542, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052446290529009}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f9d84986cef24e5280f434a1086c7dd39eac11db --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5082872928176796, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014050555322824185}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5035516969218626, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052131146915841}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..19d78797e1022846932f95ab77ae660d1aed0ad4 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5090765588003157, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014050170094497704}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5019731649565904, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052376259225629}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..92a2ee345533ea2b3e125ec15abbd67c6ad2f28b --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_True-or-False_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.510655090765588, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014049294536290396}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5153906866614049, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014045826789783661}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ae4c806e0f103a3c66f2a61c66dc6af4dadc4139 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4925019731649566, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405090552122858}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.48855564325177586, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014048804199859322}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8432d9ca0b28f8fd7c0513fbb8d5af4608110cdb --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616441}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4964483030781373, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052131146915869}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1141468d805e5f686b8a535f1b81b9d82a3f1aed --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052446290529015}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.49171270718232046, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050555322824189}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..98444eb7e8de4120b6ba40f0a3f61f97b4c23ce1 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5098658247829518, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014049749833367585}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..88ebb082cef4dbe8b69ad8382b52daebaefae173 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051956064076903}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d45cc88e6d0774b145ba2c4f85f15cd8c3ba7902 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405090552122858}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5059194948697711, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051500838485807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2fbc5280f7db13c6e19e1327218032e4d572e34c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616445}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.49329123914759276, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330346}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9cf3de087e186ab57a2da0b34b999d41cdb552b2 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5335438042620363, "fixed_answer_choice_list": null, "dataset_path": 
"winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014020826677598101}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5256511444356748, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01403398095610855}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..df7002b42f99876734623c2e0b42b96fdb59bf5c --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5232833464877664, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014037241309573642}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5201262825572218, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014041096664344327}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e077e9be06919c5ec4010cde972d8d07086f0d96 --- /dev/null +++ 
b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5019731649565904, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052376259225632}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f50d148479a26bff2ceff42b0656ee1dab4c4ae6 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.516179952644041, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0140451261309786}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5114443567482242, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014048804199859329}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_5.json 
b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4769b7ec4281b034e342438f45c3871c6eb840b6 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_stand-for_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616438}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5224940805051302, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014038257824059883}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_0.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c94243b3fd7a0d7ebb6cef7911639187eb66292f --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.489344909234412, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014049294536290403}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.489344909234412, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049294536290403}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_1.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ef0da5dde5d662f6aa2144c3b482a5e8424485a8 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.4877663772691397, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01404827882040562}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.48382004735595896, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014045126130978603}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_2.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..39a39017d13cb6e376770097461e6b647895c5c1 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.48855564325177586, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014048804199859335}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.4861878453038674, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047122916440412}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_3.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7850f747a9301e82c260a79d56fcba10fb516579 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052446290529022}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529009}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_4.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..34f468f22a1186aabe163a960fd98059cea7cc81 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050905521228584}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_5.json b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..783e0f4681b430b79bb4c8f31f5827ccd2fdfc04 --- /dev/null +++ b/4b284b17boscar/eval/agg.4b284b17boscar_winogrande_underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5209155485398579, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01404018549421294}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5122336227308603, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01404827882040562}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78643b94511a9820f587a58f7847dcf1f6475ad3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e790663ae7718156b2c8b6126696d91b4828d51bd01485f68c2ecc8e0009ba3 +size 8208598 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9172ad146c430c57106e01b4c271b630124bd64a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:def20d08ad1cad43a53b7709ba8104e4f7472acbbe58d589182d8559ea1d9a68 +size 9398420 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..769361b9c0877b651b6cd77803f78c9dbd3445b6 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ddf0d7ef95712f43982be152a113aaae208eacb6854a64f74bd0abfda9b9142 +size 11186358 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..859fccb9dced3a0b675fb20e59de2ac2c67494bf --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ce66de902df16d13fa8a006f07bb20a24a0d7cfa24d0245a696d6d393ec6355 +size 6503584 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..5e4066cb1ff8c76ed67b0b2a7c36bfd4bdaeb633 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84787e250d97e0952de381249c0ea1d0b6d2b44a7d76655c6170e40a547c6cb6 +size 7375256 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..175854c89e4acd58b1dae0179419213544548217 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:630f2c9345d63afd9a3f034e865b1be578e031f3ed3997a9d8a8be6955d4cc52 +size 8257405 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8e11e7715acc4bdf19eb2faccd2e1ee1057a4dfd --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c17143285f12c3789c245815b904d6982bdd79d1e93b5f604bd374d9e4bec8f +size 8818456 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..56863c5c2283c0bafcdfaf5859f8db643e0c7960 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5c2a41d1558a7f1432b2ef1ac09081d5982a8144dfbbf64bafb9fd7f7de6d47 +size 6790272 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..efca2e97851526ee9b8d5fa2860bdfacb4903693 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3146df758b0d3484b4e0f297281dbc1dbcca95912d0a6248e1d8891c6bb682ea +size 8249676 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cadb600e3d7ed79a0b028930c5196113647697e7 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23ccc413e60321f6dc0cefdcb41770bfd7aa67b5ae9fb9b5148e51711734c883 +size 4851453 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..081359e2fd216841a9e912ae1cd6b8bfa7c9c43f --- /dev/null +++ 
b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89348d8879112e4627e5c28d4e6cb673d8339fb007382182148e43e941e428d6 +size 5564610 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..015fefc1fad3a3048b75357bdd7d7254fa40a15c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5458c4dabf55938d5519987ff1fcd8af6783d6b929872083cfa84aa6e07251ad +size 6285722 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7694d632209c567f3cecbb1fc6a1e3d03477c807 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35b17d7d3d0b1c71d4816f4d765e398853a6647fcb402523a1c1392edea1fb05 +size 8784192 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..094b13223dbb04bf47fae2176f0b1138bff5fc8c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f14a4cfba8ceb267f7c56bedb27168338d37d491f851ef835b7998e55ca356f2 +size 6358156 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..67c527f1a2a40e92ab4b815a024f609a088d8af4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fdeb08888c26423225a5d97c5a5c41b4eda3e588f0ee521f8942699f37093f7 +size 7568460 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..15a4dec8a16617f755620eeb63395c3033956667 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bc57d2c9a8ac1b83ed4662f0e895928fb5a4a82d4bf24791046629a0b358a78 +size 4436878 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ebf50bfa40377a3beebfb188611e440f48336385 --- /dev/null +++ 
b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3e39ca21213544c9c3d23678e203f7d3573ddd6e6cb34cb1afeea65040f3fc7 +size 5095641 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6821d2e7878e729d0b11007ee13ac5cd202994f3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40d68ec9c348fe5daef96cbe712bcdab5a4e41ec535291d70db7e81817701177 +size 5752619 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..935005a567d80de54280b2213bcba94cd145f14d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba9346df728d2a2dd9b58d226a296c872d4c6d483d4bde2bebacaf1b2a8642dc +size 7989268 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dfb86ddc87982bf7f5b7232d4300c40f08aa7131 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e874de65d93c2502ff20f24c47716de332de864e8aa861012d82eb5ad81824f +size 7649046 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7c702dd54eea309d116d9f08804d0898ed572f4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63a7986aca7684716457ebd6864630a12d19ca2a6716c33caf941967e9bb26ae +size 9330802 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5c5943aa3bfa9bde923c35763ef262e212c2bf41 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be0f729468384c57aa18532813825adf79ef7d1d260081423eea29bf27339a53 +size 5483010 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c16a4ca3bf34cbfe3ee8b465615a87e3fa197160 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:4c82728f45bec1e4a7aff454145edcfed792d699edcb2c1cb46d82723cca2135 +size 6339615 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1d9668f8c1c970e06fff47ae080df7aca5eda5f8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aab1b12088fc164cf36d207bc1dcd3e2679ded9cc49b4f4f8814fa4308e73110 +size 7198667 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ddf7be66b5e42f2d5984d0828e1468c3bb6a6b91 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11c61651a6df1aa65ccd76827f35eacb972d822ef576a8e760036171cac20745 +size 9373670 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4e16d7bfaa609395020057d68a9a748b34636eea --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:022f9468696a88d59a3a66d83c0a582270dd5aa33c025ca16e5e9361728340ee +size 10392536 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4bab8f66afe253c7f9b4f406db6a15d1002ea31c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8e8ce02a07f90225d625e8848dacbd805aa04c3b74bd346986fad4bbd352d8b +size 12889516 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c26914f5f97bf7a1cacd5a2c18f4320dab745c98 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f93e6a018a5d9b27279bc7be6bf39e82a68d9b2d6bdfc9f263fafbaba8214a5 +size 7760948 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e8bd0d9340641dace81aa6f94e4189c1eb01fa4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8b056bb65917e6d0d5f318fdf09f8eee0d711411c29ea33fd5a8ba9e8af0c95d +size 9078994 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1120a8b5e6ca743b43310e286d586e4af3828c57 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2557159e08a086573cbcc0a4c0b6752ad7649bb143a0bc768d7dd4deb6f48736 +size 10404831 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a1e06c1dfdd8de51a14076739bb5376685130648 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f003b6ee38f8c3ec5dc4c7359c7e1831af1063d539f690a6a20c9bdc9e9d3de +size 15434852 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..492d0c125d87686461ca829527303f808a5f003a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea73c219ca0ed1f90464a30f5673da6c03291836f07146845b7f6077e49d96e8 +size 26834552 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f35f509a8d6e7664ee966c37ccdf69b16a3696a6 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72cec6d946b7b62541279146a3af3b60259f6aea0da12a136e00edcd550337f3 +size 38098032 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e3796480662913dca768f11c4bb05d883938ddf2 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cfb4107d806b540ff7a1ca8c08ab8b68226cb568c7557f72c70c21a8b6c2d3f +size 24523181 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fa6ee2710b2ec804e426614bf0662f0e579fa80d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:730c76b72d4b2cb9ab8a75494402ad6d8473d0d9001d4beb0e68c12ed6f6f718 +size 29828430 diff --git 
a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2189b101af4c53e93f9bd6aab7f98a50033bd142 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70deb511c7753261d3403f439e039ea20fd2fe61472b39ed4141dfa7b24e386e +size 35281371 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cfab19f615e53d7b2d13c3449592027ca879c795 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f08b30b4338fa71a943b5707aead7c23b98aacdc748d3cce236ebbdd80ead6fa +size 15388360 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f15d74a4025dd8ba6a7347995dc2e265878feff --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a85b4d3acdd5ef91a35c0d0da72b09e6ea89f7c88b7ee6758ba652850a2cbf03 +size 26609434 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e339a91dcdeb9507238d75245d96628ad1cfc74f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70c08726361714e374b6137bf1882d99b504f54942ef410d2d3a13fe1eb4cdfe +size 37769990 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf7e894466d04c7cbef53c434d12ca880701c1d4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be0a1e1ca57113b62b813d0039842230995126f7f00d2365bfaede37b031c3b +size 24509836 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0f70b967fe8d606954e1ba9c92b6960898c82f2f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d48d673410b43bf6c73c351273c5ab332d3eca5ea45e25a38e36c3f918f2c3e2 +size 29935287 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..7ea3e60e55e1c95062914a6209464dbefd6e0e1f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2f5a2c69e80d581e44fe880ef473ac8071037994bf494589ee0127376f007d +size 35427474 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..309164da84e7bcc669e799a5ff00c81c8e42db4e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6424acdf9a75abb16a350e2daa4cc48d8d260932564e6e3a3091815ed6fd2243 +size 14649286 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..934853fc43d4727b89478b427af4f70738926756 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10d6917c3c5f6fc42836bc68c4c78b66c604eaf3a6cd5351de00a66f4bd2b277 +size 26768662 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..965a63d7c2eee719269502d0f1d12dfae61ecd1b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d958f8155ed7898baf94d5f9d9958626bfa47595df022b7bc58905279d66f3c6 +size 38261768 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b2ea886a93702e6da58bf0ed4e051a7ef80bd674 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebc1c82837c6f8c7a9c598d793d0dcb4bf32ac3646a002f1ee749f41cbc3c7f9 +size 24655499 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fef5eb44b86a491d71b029114d2be2daee274af9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e3be20eb1109ed4f8cd7c2c6c4c39438efd0e0afe67bb217347a3e750d9e6c0 +size 30040662 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d05a5775450a412a8be8b676cc76759fd94912cd --- /dev/null +++ 
b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed854096fa610008faebc24e4a0ad5adb2f7dea1fcc12fe46fc9d076149c4789 +size 35570868 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bff43b69264aa0210d95a4695d8988760b0e6655 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcf9f380254f7629e62ff42a22d9f94759a0a8cd1021a89b9a73ab97b2423c7b +size 14942760 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..211b62ed111d2b475d334efd96c4c86ab690a0f8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a1033e9f4e344a96dbe9f885ac526c8de19faaa338978cdc51e5ae473de991a +size 25999566 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eb7b68bfcf50b8d47464652e8ad51ec63330ddf6 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecdedcb0bf157b8cd55a7076becf59747ba2b8c9fb601eba6d596defc5f2f4d7 +size 37113314 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8ea544bd8a6521192743938aab1b585f2d77d470 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24af7e753b6a31addfad7a1e4857e07f173ff028a3f8081ac79407cb43870f04 +size 24018782 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..459ab47a69eacbc91780551a8c3c54d8f357371a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34c467311073e48e40f4f99c6fd044dd37ad92d16589e66431cdfb8d531dc61b +size 29364578 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..82a9c16b61186df492801ead3c1b8238f88f6ed9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74145dc7afa6fa003418ff3a280d2fd67e5ed83e8d4fdd344481728cc83389ad +size 34782235 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl 
b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c40b786f7bca58f830441245aa391aa4859fd4d4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8069ef69452ab2c4aae4c08bbf5ddd942e3e3d4c8c4e939937f61f7d631d2ea8 +size 16144846 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d346076428f31a95f202e2d01617b8f6ecaad46 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad92124a7597ef44cfb1e091adc47f4d3d413bca695a91846599cf8caa87cc2 +size 27559352 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0cbe285815e72c6e4a44a5752403253d9d3d0a37 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a08a1d4b720a15daf1d9899ef8691a1f3b80c33c92e0da1d4557b69055698929 +size 39598010 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..72050295693080e8762398d65a2e7aa2f584fc54 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee22b9d7de3ad254afbb529d5a12083d596683db714cc1946d126d23ca31f89d +size 25442967 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5b7268047e7dcdd83b522d8c534091d4fa6afe5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1653abf6935634b2410de019c3ed0f5ce3fc8c584edfa458c8eaccf67941d2da +size 30849614 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a9e5bd883b8cd04e347a65b8345d5b1531ace45a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43ae9d5ce2b082bc4175d421568e80d6348fb19afa47c05f697b424cf1468fb7 +size 36471298 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b92a331b26d5953ec78928d9c711e7c532d3f63a --- 
/dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c924c0db1c77377bd1257381588083de361d245f46c256b5fde95876428711c +size 1986266 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6ce9dc4f4762c8aca6c999a76c17e2168c0bc7ab --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4690ef70b9e79a6e761e84c4a031d013fd4afbb716d338952faa3888de086307 +size 1452301 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..57f41bb11ac12012e50c23dab274aece089eed7a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95ff7f3c02eb7203909acf507ee6ce5a030ec2695900a787e2d6143e6159a859 +size 1910751 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..76228a4f2af156e1d61fb24fdcdcb96c12dd8299 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21bc9e05d3b46c82cf2539d77bd95bec2eda58a29cfb62f07730f454abb1d4fe +size 2367841 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07df55942cbb43347003110272cda8e8accb713a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0bc72a78bb6b6c98478bf316a06f02a320a97e3a813ca3a55291b395f5dbaf4 +size 2822036 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..76a33e1566b8cdfbd6d0be5e3e59838afd623296 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5134f67261558f8e293515180853b3ad53ea29ac29501d0e0b0ef7f4ace14a92 +size 3278292 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..434fc1ddb547b602f3252a7c0c745efec0d22a6f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d4a129dd9f65f351d1183ccb38507176377f7d8c7cae533bfad1927b53d8163 +size 2406258 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..b04ff90e52c6c22480cfc4102857127802803be5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db657646847542e07d6037463a13e0014ee695c25a97e1fb1c1d83ee39066cf8 +size 1754911 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c51543a8d93b56b39fc4995248a00a309d208677 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b31253832ae1889fa9a8cb5f3eb3458cc0862bf72d2075fe1d03ea9fee1d792 +size 2304205 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ac97364e7ac82e55f03d5199e428189e7bc036b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecdf9acd4c12e7a50c5013711a2ae3a4e4245f8f7c532e8b92138c092871c40d +size 2852755 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..34911c9f9de38b85dfc77ccdb987a12930b8ddcb --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32663fb3143d0636d8c96308ac2c67395407c53a0f56f8095ce69de169bcbcca +size 3398437 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0f50cdcd79a2e062b2fc06f02be6766a7ae7b9c1 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75bb3ff7c2286278937f17d5d3e55814b6fc2943f08f1fdb317d94876594154b +size 3945954 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a9a32eedc2acd5ecec5d179fcb177e81a96562b2 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf26de5bcf7bb9810f09062b4afeab538d721c61eb433e27d4a4591b0679ca7e +size 2016956 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45b2782ac4652dd6852bf2dc18fbd3a31ae6b747 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad51886490d5679fe5e10ee061bd78e0fc61af278be4818698798c3a75e0c91 +size 1478640 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_2.jsonl 
b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2763d2d1bab712e22952a1c06a173b807dd78be6 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ce9f39e36ea9d25dcb27b7411b993b1e968ba123fbce85e18232bbe24cda773 +size 1949491 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a46e7358f34f1f5c94b6b03ded1b77e20ed36d7 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6858975f788b3ce02b8b4561fe4db1d3abb0b3531a1878860b38e7d18a557dde +size 2418853 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e316788a3993dee3ca45bb59029064c82c2ea832 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bac90eef36f6889f68a5c8e5afb303ee6ccad811a0de884793e11954f12b80ff +size 2885283 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..052f233891bd4054a23639db39287de83af8f3bb --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33104bab3ea318d78ae2ca2e6c985318094d8711174d3a08092299e95cf51a6e +size 3353670 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b97e85f0fd71667e3114b4eedcc37264d8572656 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5338f80a1ff167db6925cf176f4cefdda7befc9c3356f08393053b96a45f0d68 +size 2320240 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c68a36922d4dfdac03106fdbc9e639aa84420473 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31684dd0c63f390122fc6711f27aab1672cc4d6c3dd3f115261a492902132479 +size 1668231 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5bd586999e4bdb08b22e37548377d47eeef3c0e9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:da55f203a35b484a0c66b83610b828c0c2402b9618b02150f4e8f7eba7370a86 +size 2176257 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bb477b0b549cb6ac27b0ae9b0415ff99889e04bd --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bc3289a99e7673af47c7896f6c3f747602df70d116a881007208b8cb680b3da +size 2682983 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4aef2df0909f970a422e0e665b46bc34a817b1bd --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb82acc3f39b760f7b1c609a805abf5826a82ac4edaed3e04e6f68608aa9e641 +size 3186908 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba037d36b8609a54b1a7c74ae462e49ee56f0ffd --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6760a76d33fc710f4d76d8bbb59a22525e85f03a4498408410ec33a05dbe90d5 +size 3692879 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db7f8fb52b98ce53b4210454610426ed9dbb9ead --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8c118b1c0547102fb47d54b592c6423273f840e23fc24860c42e951196db611 +size 2053876 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1b2f7514c28b3d0e60f7619687b86e35612ea5e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd21b3162cacfa57437728e6abdd185de9efb3bad975aa59294b76c811a384f +size 1503640 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..421c68e01945e9e0201cc6e626a675918b362034 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:425bb401625e6c054365c21a2263f4c862bcac3cd699f98131fd7f02a3c3d328 +size 1980252 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_3.jsonl 
b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a40a170cd66e2a3b0cb1f74ed4537d47a71dc1e3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:201acd130a60049f1a91ebb2c240887a1452ee82a66ee42d934c9d38cb23ca4a +size 2455774 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..30a35e2b4caafdb10feba7ddd553aadcaf74555f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e8b8260e9a1d6a99f7f2bfa4aabd65b2f4b92bc17b88241196f707c55b8b75d +size 2928279 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c01a8c4971b2791a943f0649a190b35f964fc16e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r1_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f592d150c0c8f283287b5e870fe25899a17e85f44c38e243065eb2e7a6f62e +size 3402636 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..44324996431d349e64ba7ccc6ebf3045aa04c108 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6c5e2b147d7ad277e17f0142ebb3801fba18765851346ca733bad0b12013575 +size 1987050 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5f3acf79a0cb58f7f38fdd1ad421935124286c7 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:971c9e853fd1ca0df1555f76e51ff5562bb13ebf08b00e3bc945028b438b6a20 +size 1447440 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1f89c13f97613794797ac3d8ee0fee3e120cd0b5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82b1d78dd3e145a7665c2fbfb4ed9a149a092d2e64b09a423c2643706b79444 +size 1900764 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b907ef89d3fb90ec52c7160abbac7e1451c6ef94 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff7c13c2d8223ac223bf252400f84701aa9b01df66275f43606a3f1ab006cbfa +size 
2350336 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4b581fade6a02ab78f340a64fc0778282b9a0156 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95934fac416522458bed185419f27224d28af4000d68161d9083578ed0c5b6bc +size 2799902 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9f7bb51b8659402d18621c08c14ec105d6b580c0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdf79484fb3ef64e2d64bb73824e7c2edaa0c95fb966e7d451b161886b31b022 +size 3250476 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f7dccb8c4f0b32611e6cd04b56253f449421922f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed3024e3f491c5ec2d4c587c0c07d8cb9a4e0c6b3865f9d7f5b5a08727258d9a +size 2407086 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..50cb300b23e40362b5beaea14e7dc89e56941437 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1cfe85f73b24e30ca5851068a1507e5f9e37ebcd7395c547871da55a1fcd73e +size 1749956 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb70d563880a81aefa6f4d745bd2bf91fc342952 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca76476534e256fcd0025e79ec3ac29d25029c6e6b73773e59509858ce60357b +size 2294170 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..985233791fbcc5f3a44bfed6eb02c1456288b454 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:589a800013c726d02beacf3d7b10225c224fdc89e1bb97a687098995e8ac1117 +size 2835456 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6482eca0b0808ac0701af3aab7b56d44864d5a28 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:b1fb1ba31af62f7e60252ffa51e29f9a3dd34a9403d5cfd46f3dc005c79642dd +size 3376527 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5c05d68d6c3af3679c0d78304890a29f070a320b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd80d748c50008f386f1d508ff277d817eb15ea639663d2209ee247f0b366bee +size 3918316 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d13be52d37cc7397e0f32fdba1f31574fe722fb4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a92f5e6c73bfef70c8fd785d22a853b65e25367e97b3b61ae30831d57a7009d +size 2017842 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68c84d98d6c01085b3a64c630542741e9ce37bf3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:496f039a787df2357c566ae4c5d91957bc23ef327e30aa50c836a579b31a34f5 +size 1474064 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..65e4e5dcc7808c96f29fbbd1b6f5e1e44121856d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35b1bce9695b444c280c3d5e06f4ff8be02e85e2e7e43a74bece41de48364ae6 +size 1939711 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4bf9f52ce377915d0798d145377feed9bf471142 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25b2c0b520cdb8969aa1206942a70dbb5f15d9c3714d32838611b3724310b2b8 +size 2401635 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6ee15a4e4a13d07418e06e0975f15bfec1e4d2bf --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a91d3653bae2ede03fe2185d7783b16726b4fe5187a7b0d9a740853eacdba01d +size 2863401 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..167e3ccbadc542e68dbe1d8a861ca6213c6d0807 --- /dev/null +++ 
b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dc037941cb630d411d8961b6cae7e3d7c408264d6a2b1c57eeeaed46e95db64 +size 3326062 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..561226cfdadec44810e64e0ff58238721e047900 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1a7385a156ba1a244d0a68510d54b10bd99a5e484fd549bece27fa9c1f0ff7d +size 2321040 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b3c53cc1969ab83d9fedbc38d7e65382376b30f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d392160c895ce3ac05c5ea760b67f3b3e8096eaaa9c6d3b15cdb5ea0befd44a1 +size 1663463 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1145f8824eab3c8a326ad3bb41d0e8f79ce7acdc --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc3a851765fb00c21fcdcda84a5e9af5a6f38001175b08b1ec080f20e0f3c63a +size 2165882 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4034932077a19b2595c4594f1a86445227ddc1aa --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14af21462cccb77a3a678199f360d6a79a48e444ec39f25dbc1dd3ce25da3478 +size 2664995 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad728cb28ba85de0afbef19d1a3253fba00716d0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c2068371cf75c786ea95ed10806c18308d6b41003d3b7f51b7d0fa11d17e282 +size 3164031 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a8866f798631d26ff06d0b470668eec026da120 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:893d4b9668b94bf6b8acd3b668292a631cc5e89f2ba26cf71593ae241d09038f +size 3664220 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f7bff0337e4b04049254e432e2adb2d1e31c4224 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86e257c9d31bc42753d9f1b4dc495745dfc1fa44b982e3efd78d6e13595a7800 +size 2054602 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8f35ee6550067ae44859052281da76b0f9fc053 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a0204268a3b2aaf8b93a5a32c9213187264e8125ce69590237f2d0bfb6f30b +size 1499064 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..76e3cd38fa2117bc43a70b30df76abe07b1aea4c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a17784f288799e8e281cecc8ea32276c6e6423a0c2e1d1dd2fb26358275fa692 +size 1970488 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9e1700846a274b364e8d69cd0e8629c0c5cd8021 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5397a0fce798420a11026105314095ea8a5462b4127b43ade1d9ae82c9d71b6c +size 2438593 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab5a19605bba9ea8ca13353e31de34f174846bf0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7c2a1539472efd63722be4fd7402968875391af8d92aaa85a2cf978151789a5 +size 2906342 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..327a9dc1638f909bf0f478ead60d636ecc4b8821 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r2_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c20728bb154d80dab88a28741ac894df17cf0572c24f85528e4985dd7ad85b3d +size 3375059 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..f933488f9807ef5f7753fb01325b7450e5d403a5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc9e790d7929836551b715865bee847ced2941a43349aafad612f7f7e17dbed +size 1169185 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f9f604dc4197fd22548bbc755c3d4bd0e1316b9d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df11b6a2bd191e83a34fb8acb6ff7507a64ae30533f69ce75c1db0d6ee852402 +size 1699016 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fa777992e20e90953280933a7e906b499c2a61bd --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a1220e5527fbbd9f56cfa33b6fd63bab8500eb34ed7e68ffd14f181a412ad42 +size 2217781 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3437cd3379d5893e98b444760883e5929ebee3d3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6d53c878eaf8dbf1faf1b433de3c8ab988b1199f4b47ce4185b96347d26b437 +size 2730908 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb08481c8ebc36eabd5aab33f6fbf8fa743e05e2 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6777ff9c2ff0d365be287600a6898dcc83ac0661d0cad90d8c93fc5627ff7085 +size 3248340 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ab260dea0a582c82b8a631ed318d3d1e05dee27 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59d0563584aa40acdb18459bbaa39a0751860d599ccccad0880bf81019c6909f +size 3776789 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e1d672fcdbf6ac60a92d81ed51468c75a660e85 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68b41ed4d21a3abfa1dd1a31232d46fc75a6461b6a55bcedf50bb1865e8f0217 +size 1421230 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_1.jsonl 
b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..186a2980264b3ecaab8ef484abb00dee9c50a9ef --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da67c73ac011980037561c17c446a6d44c60d8070be0315c6ca47fa87924fc4b +size 2061704 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..af4b3f6ec5684088dddb104785ef38cc95e5021d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bcd2e07f17d8001308d68dc19159223546d1ae9b520fdc6ae4442c8f4b150b0 +size 2689408 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d57bf5322bce77f25ef6e9dfef2242daad53a509 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9224d57a97723084d493a49b90e84f115578e3377846d30aa8ba1878b20b2313 +size 3312324 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..de9fc7bd94980886d4679a0881666189e7336543 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dae3dc6c5fecc8cf342ab85fb9647172d6eab70ec055f34cf19b40576e0005c2 +size 3939422 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a0a54af73c9dde2e724a279fd6c739ff6e5fd5c3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe0020e5be458c7f81203fb3030d87872e7da30db7a466d27f7222f0d97e4064 +size 4577425 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bacfe380d9fd37886bcbb741319e1ef262856872 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2460bc671822cfd047417fa3e0c43ecab00a22e703dbdb322e3146c064eab37 +size 1187176 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5907ebbbbacec7845f2d1c100c6e634df78d0a52 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:df0b45ff0d80f08376f1b54c7d7510caa8528ca77df6f3596b2137db2040e778 +size 1730743 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a37c3696b67d05448fe3103cb6b6ca351f114b26 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abaa3933d8a97f8bc0e53739e48dd54315ab74a0e691919f56719e5cc3ee717c +size 2264384 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ffdda5a7b9087780d856b84c4e1ea5dd28dea90b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13533d4b9e335f56ec9f2506976aaec85fb099da69b80dc738a8e5fe1d1879e1 +size 2792101 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..132c8eabc0f1549cc7092f40fde77ba0f07ccd63 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:609921a9e7a99ebbf5e09b518a9aeb5dd8982e81e2bd67757cb2784ab5f33bab +size 3324004 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1f50aa29bce062e816f1c092eb850c30629e2d05 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:945a2eee02c3e62ac5cd35aeb45d18d4e7743c13a447a0685890f4df23ac7449 +size 3866892 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a6c19adb3a82108e753bd1025d82c43d419231b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:321af9678b8d057670b4fd09e066ff449808ac7518e9087fe30553ae19aec3ad +size 1369737 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..274062d1b2d3ff714229e80b56a8dc88c1f3b021 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cf61298a0c167634907e8fe68c881d752068294072793f98ad2a54d1f066773 +size 1958855 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..aec6b38898c49aa21cf6d1f4140e7b3f2b6acecc --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73810f0c7bfd085f902148d91104e83d59e42b35cd5f4e5ece77cc3327d5c9cc +size 2536852 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19ea3bfe4da4c482841c770646b05c47822fcf80 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:797852f6e88366880b8f8e32b5e3521589b410ebe486dd946990791598c92f88 +size 3109651 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7a6abbfa30b34eda86c6bf1143d7acdbbe86aadd --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4820e8b513a45de39d9fccbb389a85b0e0400117e2a540d59d6301e18bdafe2 +size 3686782 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fbc69b6564e23eec77714a0405e125005eca19a7 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e969665cb1dbe6ac318a50e56de55eb7754fcf53cd553ee0c709e7e2872a4672 +size 4274921 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5bfa999168f087beaa5ec8171ab35d55877c18be --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4854bbb4916679a3652b795ce074a2306852072eb9341b2b6e26ae19d7de9b05 +size 1210145 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..39933c2636ac39ed7b9cdb1c5a6051e44e115f74 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3f7d1d6d6548fc5354424fbd538b705ba9ce9f7bf9a237569f921608baf3021 +size 1760743 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8c783bf5385831344ddd204ea8e327f8a09b232 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:ba1732203ed97f7f936375b3dbf297a0d91bae7b86e896c4161379107e85661f +size 2301346 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f4b47423fbd433211bd527b9aeffc347bd5ec05c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03ecbe5bc0134f811c623e7aa94af3b8bd302a279280e42f99eba3f1706f55ae +size 2836346 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19607696f5d0af17862c3b1be625e84b0ba86f37 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:824db5d8a4825d15b84b31e9015e488d03bf629b5b61bc87c1f4559d0a9518bf +size 3375532 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b433f86ae15afa43e987e33cd6ba5bbea3e475dc --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_anli_r3_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fbece3984f87923f2dcbd568ac33edc11fa62a197e4167d52a49de2717f96ff +size 3925668 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..217728e21965b77b0d5703b627daef916080b01b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89853996a83e967033c728581296950cf37992055f9623afdd8a23d39678f223 +size 1216777 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd58b13886c2634efe738347f1c438e1d41ec41b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30646be0a399f5defaf92f6e21f154048e5e3ad43b82ace25299f16d1db281ad +size 1670632 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c1a3ba0c106d5cd15b70e45b80d6957439cd7554 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:862b1b917deeb54a8e389912d0f1752d5b7e0299fff6c10843d3b995ebfb46b1 +size 2120084 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_3.jsonl new file mode 
100644 index 0000000000000000000000000000000000000000..cbb9c86ba2a8af175b5200981f39f3929214e913 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:114d1cdcbe76462a83927cb9f05477de89ae3278c82505a40071373556679184 +size 2577615 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..568c9da442aeca690ba70c491821a059b55aa294 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cc35edafedd90fd62687a8a67dffd65dc9b856a987522e475a9705911dd3ecb +size 3027517 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b8b16e36be0dbe8cb58f31ecf57e4f9950d16e2 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:085b0f409faa9b5470710189b00fe7d7d80134d723e4c4e38c7d2bf3c1257690 +size 3479076 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8e6fbe63297249e1910a8767ae43283ea988c7e8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c724a8c435931a51fdc32e8841f8682243f798fa1e508a4b3fb7e0556ca8028 +size 1458119 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..328a488f1fa5f85b0b7afd75e3804a8db8fdc8a2 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd448fd5d0e0072627e30fd3370920d9beb27337e86fe17fc702f556801334cf +size 1961304 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a66c2655ec2b2496df46328631e3e52d574f8be6 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e4e73030cc0bc029211db7a271dffac3636d95704298e6dc426b2ad8867b357 +size 2458023 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..455cbcf00afae11a57ea2f395a32396703f9d7d4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b99680a5d673e3060508a13602468daeb05e8be3fad90caf818ae370688d4eaf +size 2963594 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d5601eae157d5093da2a59cba92246e0ef5d6448 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f1052c7220f49081a53d429a85ff4720b38c0b1e6216faddef31666be79b8b +size 3461617 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79763b5a95423ba49be04de0b6d72631482e498b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84ff48a9e3f35a12cddf36891b36d1199316eaa66bdd1a4771e1907ebdfaeff0 +size 3960918 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f22835022caf35f975a597b8e1da37b4e5f40bf4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9c68c3857428a494640396b0dd813998185dd6874622d423e762a67c698cecb +size 1506084 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b9ea51a47ccea2af0d09600104af4ea1c2c7aa8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88a97f5f441a8abc5307154c39d04a734c049d71414d238369b396a15b209424 +size 2032756 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fa544762e8a8e73d80fab670cb05f35683b9d832 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:279a0dff491be11bec5c423d1349ed4ed4edb4fc9ab372d9fa192bfaafea14f5 +size 2554188 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4db4c999b87c963a67a0228e124a834fe1ef0a7 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99bbae8bf1c6220d854ba7f318cc5fc39694c85a04294db329f15793eceb638b +size 3084588 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..7fbcafc67303181d51ac17cb7cfd58f15d1aeed0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d18c9ff5692806eb883ff809507b369dd3c558a11f1072b633e2d2666452949e +size 3607140 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d70ca0c3ab011fb7d56e66702ac1c6cc5e82e0d1 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b781d88b1079d0b4b5cfbc1f37b4a89b40b7ab49a11856fb33b4d7dc227a7a07 +size 4130796 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b73d37992b3cee3480c4c5e0cafb1c1c6de19af --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f35fb273c77a27e6f021b1f7dd08d5c3e635bda4b6436e1fd2acb2a4bd05fb2 +size 1202714 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..373e031c23e4d51c9a69af68c94d2b308e4071cb --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb578b60b12cfe4848dd7f0e70c2c58f5f9cbad297eeb27eb4830e9831d928f +size 1638992 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..87c7420693972d6848cbecd24eb8f86c1f8c6c54 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3355f925c397abf5636b3b05335d4bdc4348bc9f244c62b7ef1ba7ac5a80ef71 +size 2070864 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..587f273d2414316f8447eb460d5748b03e7cf232 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeba5c3b8f1f429986645476bb0b81e5fe54acf22b4e756c48d54cf1a8c2e8cd +size 2510815 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b6a9cfa9ab6137d52ce37ec6909083f53c4cfea0 --- /dev/null +++ 
b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:740e1e6f6fe28532b9878ed4c447025e934fe23a00d1b726b2f113a08ad02fa0 +size 2943137 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e11924f3f6e64620f5a86c3e42c733c742f2824 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fce4a1e750f9d1d04959f71ec590999b1ab3c5fbd3c95a5ac8852469f57d0dfd +size 3377116 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14e413af89c71f8e7133f5af6ede15531889ab23 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d62b0768c0b24a15907bbe60c885eb92d54aea33bc98c6da08865df54f3089bb +size 1187289 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8f6bccbb278922ee6a3ba4596bd965fb80ba706 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2c2fe3c8b527eeaa5f090d6e5d2c0345ae400ac01c8682f70f0df7c767af872 +size 1557600 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..33cfce76d6adb0d974f04f905179e68f99c8bf54 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96a5e5144a76dd8f222751a028d7d4c352c66b32c34439b864462f7b74542493 +size 1922205 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8f36b09d6eae11c6c2d873ad365e3cb5269793c2 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a73c0bc9700dcf3f7623a5a50446ac0731838231bd16843ff34f18914a4ee57b +size 2295384 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d493c0cc9ec3e7b71e3af216dcf198557e5fd3ab --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a558b5c6d96fc87eeb4d9faebe05055f8c75d654b65a0deec8f46c108dffceac +size 2660928 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_5.jsonl 
b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb6c1d206fe5a27c9c64c1f7cb6ad28208e681ed --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_challenge_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44d10d02e8ae30d1fd654e0144b0a42e133848dc88120d09a98015481d7abd06 +size 3027736 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3e6d79be14da464be38f9c03352aee2dba6a8686 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e24b49c170d78f3daad9e5cf63a47f499de7008107ea355764b778dd8d8ca67 +size 2351241 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ce8d230db9f23ad72ed275a1fef617e1e5daf86 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4908d48474467204422ce68f2b366c86bd6627281467c7293fddc1b87e2c6252 +size 3174164 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..efb9e755a7fe65a1b9a21f83c1314765c259a601 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec3475931f999c1bea27d61f1236dc5937858fd27d800b29b68fc9280d5930c6 +size 4008946 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9d90fffc15f330eaf271d98002a836bdcb42295e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be84b30c133c0bf19c5a08cb2a0da660ea14e30dc4f51a390411fce13fc69000 +size 4831612 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5a62f8fe808d6228a44f3617d86afeb20047abf8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7cecc4407474253261ef3acd16becc5076f4c97a94540a6f28a021355730798 +size 5662356 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b4e36e17de16a0556b2cbaef678a0e4679f9676b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a65513fbbf1de5da9e1b718ed4665d042ddab21306971da47bffbb5da4e59f66 +size 6494729 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c0fb618f23c096f851068c7c28489e5bf04469fb --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a83e0bb11cb375f5c82073cf8e71f5839d8ce6a27f2028d0a0c34d900fbccf22 +size 2745655 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d8fd452c917f39b656e5ac17100b7ebecb008c72 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f76a107049c84e6827a5aa415305dd7a5fd79305c945930d6b0de1241ac864ae +size 3649168 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..814b20611951f158a5811c04e2c3ec59f3d8bc7d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9ff0b722fb72c267f38e21c15e294eda9abc67be9c1b0a42d7c6f26fd160a21 +size 4566726 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..049f982490e6944da25adb4c0f51f225360d48c4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b370c4a85e868628c9676285b925652cae70351eb183c7f96a72ad42cada820 +size 5470194 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8e16db8e84280e824c375058f969f5b9faa49b0b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2ee10e2e741b1159bcf58249629a52e97d219c67b8286c56a62d5e63e2edad2 +size 6382968 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e36295b0c85fea516ad99a384c1fdb9174532380 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:018222ff6431b86fca3efe135f94f284771d9ed0018c61e1d34441f7e3016c60 +size 7295318 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..22687ca8688c1b3494bc9ec4b61d472449b49a33 --- /dev/null +++ 
b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f95886fde51ade66f2357a51ed859359c2730cce7a3be87dcf1a93be1d30a8a +size 2843525 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1302faca594b14749c8dfe488f656f872255571c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e6b07feb285659b4ea5831f85e3936875ec49f1aaa97c1d0230318fb3f2f323 +size 3795536 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ae7a9f38b8948820a1da373f5153f2944bdb1ea --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e117721f3b6c797cf0ee9abe80ed8d82cc153493bf61788b8c2e61a1f3a6002 +size 4763664 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a883f82fe1ec74484068695ff0107d6a141ec0cd --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:badbd3d73fa7eacb52c26e79b6cb01eff1eb9e008cd56d47fae53e19c08d805f +size 5716941 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7a26a8f9b6d3721b6d9ec15e30688907dcbc8e63 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cae84e6218e88a5b8d5d2cce51fee1cc01bf4d9b3f922e9abeba6539ca9c755b +size 6679076 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61378b808efd43415d7f9cf513e027717b72c4dc --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86d3b7ee931de76d37876631f4fe33bf98dfe1edf500329c13b1d6b5e32f0966 +size 7641262 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0e1fc5f1cc8bcaa429cbc2219ae89d1aeb9cb793 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9a75bb425cfed04f4ab2efceae8b05d22d54dce3bc32c113ae91ce05c2304f4 +size 2322732 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_1.jsonl 
b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5d552c92dad1b1363ad0875023cfb2d99b9b5849 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c888e05873b8cf258b4c15b2c8fa7ed0375c2f0194495ea98da021ad20c3e89 +size 3110021 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6c1ba68373cb6ff672efdbb805ee3a7bf8e8ce51 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c3c85b20445e1ec4325237052c3981c9cb8304581a73ecc51c655c0fe0d4b1 +size 3909160 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a0148a794b27ec43489f4ed0ca2a6bf6b942e9e8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ddc8bf32c429df64bcf379d6bd3632b3dfd1ac56e195496b94ff8497568f2fd +size 4696184 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b417671a8fb77adc80257776d8b62fffc8282c8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bc5c06f1deefb7e268a661d1c70834430a45d9fd1cc408814eb5d7aa43bf7ee +size 5491290 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5523e39313be058d6397a9abff094d4459d5049d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16a8ed5119fc638ab8869a6b1f53c076ed496411ffeb13206e00d0a1f7d38734 +size 6288023 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5df85309f4739439cab5db393ca2ba8d78667130 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef37dee52bf7bea6c8200b353badb51e515f39caf521db2acc105819c8ac803c +size 2196638 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02e4badcf645b443c2eecfe127e4d13a479f694e --- /dev/null +++ 
b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68fdc531ae9885c33772e6b4027b9acb420df894fe1f0cc6c4f153d52c22371d +size 2831578 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..62d2b901d1d16dae057e4bbbae233261299ab3e4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf5c84032d42ebccce5b8fdc885c4eac02837d708be7b26db39f611e5c6818df +size 3480820 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0f1d9dfc8294c204febac04afc7e3197d03b11a3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20f0f2d3d2c42d01c4bc5e82993bd982a384b3bf695a72b183b6e9fa70409e94 +size 4115638 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..18abd48023873c3fa6ef5aedf1d224e828319a74 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:809238acf0d7702e9d836bccbfe9e6fb50b09d9a16f497564c61f3412f92150c +size 4759814 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a331c2035da4f3f3988682267abdd4b0a8d651e0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_arc_easy_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d0d122d6fbe5887a64c45f023c1574ac26568769fb8b97aa4278e91d54e549e +size 5403657 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb80afa7a2d6b6d28e4d6e1b3136334f30d9161d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6830a3733c5bed6276d592b5b0f5039387d59b5bccf9a78ddabf6abfb5293709 +size 3641611 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..16a6745a02d6f9ddf493c7362e038894a2378589 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c25cb046672b82b66b38c6a83b27ce9c72db94516004999c7b82e6cc5e7efa8 +size 5656920 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4dc9b4ead03cc34b86c6b7a8f059da511ea6c8b7 --- 
/dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8880709a8d07d6a30cee91e2c8abcd4923b7b12982e75631911218ef468f9e03 +size 7694069 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1db6ceafb221aa11c478bafa4b4971b14b43725 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74f2388543ded7efe518ee1fae0e3631c3f8a7dd370e4764a860b71855f29709 +size 9728161 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f48881dbda80c1e65df119b298dfb0d0f970cd51 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9518e40942d58ba6020b1fed2aa3d8baa94dd4eb2350a7118466f9a33fe4491 +size 11768781 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8e6582c88e2c8c8eed3c54fe18b1cf76c4f208f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_GPT-3-Style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b78f5d0d6d68019e58c8006334ab26cf93011d50851c4e5c41ab9a62198228a +size 13790259 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da8c5b590b0528d2a85723ce5c27e78034d64bb7 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd929a63eb499481f1db6ebd617d1fabc32d73332e5264d123e575484ba0484c +size 3984656 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14e42f9a751508dd9659039f204ae3354dba79d2 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6445be2a856074798ab0e3e7ee5a409f2e764756632baf662c7bd7070b56e315 +size 6167770 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a059a64238928d77f179751d07a0f77c08fff39 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1755ce2a384ee7f723c8fe3a69a28105d585eef0af084e09382a788eb0ff3f14 +size 8371858 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..fc426d32b5cd36384d92338b65a7f463774f08d6 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeec0a891d71afe6fe3f575094d1a5cfc7b221f43a35e2fb4225e31949717ebb +size 21145880 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ef84c639c57e6c3772868cb29818fcef42412868 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe38e94e304e166779acff92019e8e95510faa69d5f2756bd60bd62c8972c4b +size 12780681 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b448c4e9ac5cd567758ed8ff22957407bac37ba0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_after_reading_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e21398fc66f9723555ba631685b7a19c3b6f3409645afec0be79d7940c4973a9 +size 14969467 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d40d207590fe585e21fa97aee4bb270de38833f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60753d451d8b4c4fbee9fecf9a14420c3d6d35449c9c791d1b72de95f2ca152c +size 4041656 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce98defd31a44ebcfbf88f7e7cff868008a46bdd --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e724b410d329aa762ff7908aa725153b8a7c21fa7022fcb19daf34f3d23601 +size 6260247 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7eb9fdafbb39c65921337df9eb56fc6b89d8eba0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d8e1e4596423764a3a586c560b02cf0db57973807eca9caef9d6691970d20bf +size 8500566 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f41f9c54f4c2ed9da6da489c4338a436b56acaaf --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e94b5ef0cb1056716a727f1f9bbe5a56fdc9a0fc7d21ce724116884cf6e4e849 +size 10737825 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..221c72ab9f4af3c3bafeafa5df76ca394c9f4121 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42b2becefc1b456d9b388c25eec2077cd030b4928975ef159f9aa3ab1d710804 +size 12981659 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0abc894c1862bd4e4bcc51416d772d8f8ab34eb8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_exercise_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2c4841ce42e59753e6a0b028a4042b3992c6bb8614bd356f80ce2d35c649e3 +size 15206477 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..322d05e695a74c00ce6c7ee2ec16faf05aaac8b5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85b2faf344bc13de4e34ee3b4b15dc0e02b798cc2113c74603568c8f6cae883b +size 3664689 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..06fc34dec57526c7d0fcff55c95f0d381ced9834 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10f19195cfa6f0053ecfc2ac1fbc9a0296496897c88a6fed04f5f95563becf7c +size 5687637 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bbfc0fa8328cd2140f574b6f8080eb38313d4bb5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce266acb7c6f3d8c257ea1d1356c65c22675e820ecef15ca4b6b3ed59a8ab23a +size 7732749 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f58f4b123fe70bd0fa9b6c14d11637aa90cec593 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8b58612ec72812cfdfe5de9a3180f170b35c213267848cc6248a5f447af7b26 +size 9774944 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..85a8a105682f0d54a89426615e93508fb2d78b68 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:736ef017e1e707a40a03ec4a88ff62632b30746917520e7db66d9bb561619a5b +size 11823706 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..1b92bc1f1b258e9c17ae813d91b37000c0913fb0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_valid_binary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a17bd44c421beb97977ef1dcef852f47076b669f72c1406dcd6026835520d356 +size 13853492 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9640bc3e69c84adecb88832e048950cfc6d6cebf --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10ae601064d8b4386413a9ac20e0d2fd4ac123a56b4298eb6e60d13a9d03ce4 +size 3862398 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7715374b4348ad1785b39bb4526e8de1da6415c0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ef386a4fc84b20d97b20d7ecaebf02c38a079b0d953914e8dbdd4b2e89e1550 +size 5990492 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..39bb5e96f11a49a8c4e45291c591239722a2d7c4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6761d3e5cfca5825d93819ada2342d9262a0a985d295d8289c2be391a7e6a7a0 +size 8142173 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eded82eae9ebec07edd48e98acadcf29abf5597e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5af7250c1f29c8def91558bee2793c9dc3077bfde9751c6f245a44f63a3925 +size 10290490 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..152b7f1babd821d67973d2d236ca03ec9c0a73c0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b59b48d4a7d786ede344a2a927519b74add9711a261827cdc411bb50bcf159a4 +size 12445188 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b7eb12985b785e6935d8d8cc5f567cc42f08b26c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_boolq_yes_no_question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0508580ce303795aa6a2680cd9ec8eafee602933d58679d515643f2a2273640 +size 14580731 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_0.jsonl 
b/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e03370d12f96cad168ad3ae5b8b82a62bea4b115 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8253f92bef5731c757f92e678cd65671d3eb8f0e2706ad2853b654c536535de2 +size 55151 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..276eb218d7ef8845f137a000c3a2a227813f093a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91a3cfd245ae1ee0ee02f5f9e274de139ceb47e78ffc626dab00da4cdf208624 +size 77971 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ce24cb5c59b62ad95c41c948f6e5d28ea80392e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f0237487b5334d2fac0214c836cee7ee4be08367faa090ad2da28dcd6bb57b0 +size 99569 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9c7be3890ab10959c8a983dec99f8baa2e4a78c8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:befba3a6c369038c6c575621472fc719afe48f6f1736de9939842f790635dca8 +size 120717 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d021ae53c07dcddb3fb986e89886e1732afdd8e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48fa361560f2c85e8ea65517543f59bcc9eb217ada1dd1277808e1fc9d251bc4 +size 142838 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5c6b3bcb4d67ef7f3ed887a0374694816bdab0fe --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17ad19b75e457ad09fe063f868b471dba8c2a623baedbfb59db070fd42b1449f +size 163666 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..806b11a9c8b43716ddc929108e3243bee2b45fa9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b09d06d81d146b2e5697d769fd76799731df2c2d70081347d8168f300891d6b8 +size 66218 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_1.jsonl 
b/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05e727f552de7c8f5595e084f9c4ab9b04849c03 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc9c0226b8c01919ce546dc31266c1d747a99ddf5a4fc3b08918c855832e043c +size 94141 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1fce4f9beeaeafd9d19e6837649939db8f930aff --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6fdadb57b3cd5ddef6b90e041e731c44f330c76741bf794b213687fcd1c9e5d +size 120835 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d8951d7718afbb90f2a019f288217ed3d9fd41c2 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5280873ae875966085fc157d375a36b83ed92f6a60a16e7eb55207d46dbdc317 +size 147090 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..66f5563129af71eae70152871d6d54e6f808d88b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3836fd94537e6681d7873b9456c548ec4e5159dc23c442fdb8d4b9850856ba4 +size 174299 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ed58f92cb69be5525778943200c44871ccacaf6 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aed50f931db46259ec5ddbc141962f4981abbd07c3d111c4fd54544452b05cce +size 200216 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba175d6d6b689aa7a27d2a1afa73ac9499908fad --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93b3fdf6d400409f3f778be1df359caf7679a08b93ea4ce9b1e82fed2f8b6c81 +size 56333 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71d915e82080201bc20cc8f3431ee44bcad403e6 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2968da46200dbb73776d530f2887bb97caee5c1a661def8839cf56ace0044d +size 79780 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_2.jsonl 
b/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a76c7fbc4a938d6fd7d87600d816fccbb1569da --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b5a3fb0d706036714abaaa8ef4263c71000728137dd3925b280c002b4f011c6 +size 102072 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..adb39ff4c104531ac825f8a0ac4a0d8796834e08 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bce0578b3795ec62a4b062452fee94782fb8712a2e1a6719ea68ceec7080b45 +size 123895 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cbe6063cce66a90c699210db260891dcd1265d98 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:101d358eac79c7409396a8b3907766a110e17373b0be1a90cc936209f10080fb +size 146683 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0f61b21cd64488eb39642c008efe1f6e2a1fa9d2 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfd669f8e7e70dd8e5df30fde8c6d1a01b9fddf3415292269845e18f6e4eb9f8 +size 168179 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f81b6b35014c459c0126aee5182c6b38115a327c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4023370076ab4f37f85a849d05febffba2b9f27ea7170cf79013b00e3868cc77 +size 63921 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..782a172a048e3b148e4f38cf5031477531a2ae8d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b2eb4b5cae043e0a56179e92982ead164db309d95bab4ac5689de5f60c7528a +size 89650 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c0f0f43c2bcbc98c8580eea98943c3cc436061bc --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b9d5551b3eae62ea5278068cfa80ffd8189d0f9bd69c6fe14ec58c975cadaee8 +size 114089 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..67bbcd13059c84ba4f3900357a356b769b4261f9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:992f4ddb531fc63f9c465a0ceff647d5d269f0e78add2b79d4c8215cd8ff49f4 +size 138090 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a90ee6bfdb382fa46a438dea3fb9efd31755a596 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e331e2f3be66dee257a017023848f1885e52d3bcc1ca02ff544b3a849c7408ee +size 163080 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ee785c67c9bb6b9e0376082b3501fa8a4699f94 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa92772a1f791689c4a6f40e7e1bedf92bca25b2f2f8583bd20f3ab8aeae3900 +size 186762 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d3b6518bb5ed5c99c7065728ac81c8965aed567b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:544d556eeb4c5873f53e2bbe6896f41d77061af5551752f00347fd615459de81 +size 57352 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..191930176fe74cb5fbc2742e8106e1eb995f30c2 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcf95aef3bf1fe5be83db59d096ca2b25b881709786632595eaf5c43d473e05 +size 81124 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f3bfcfdc94412c33177c20ea1864a82505e49e13 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f44e89de2d73a23d74417fb39df30161daa9c14555707e5496fd53250887c44 +size 103739 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..5da538ad6186ab3a5137480d730bbd6a7e42ddb8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4151f3bc999d83ee9e8525e5ae3c93ff9ddab124f2acb61c927dfc6e9ccc793a +size 125904 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a320c7130c902c8877095c78bc1b139cc03889e6 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d06567d58c6156808f2f00bb70bf32c37edb452f9a982e515adef5c03bd27dd8 +size 149032 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6088ba2e6cfcaaa770df8b2eff2da9e242dabb57 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_cb_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5110e31abcac366e16e210f85924f3d579f46b9ab09be9cea82c8dacd5be06fb +size 170863 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3cfebe37869cb7fc61b2b4e02ccf52fd2e2cf91b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd4f56562e3932a1a0b5436d81297985fa9a521e023716f869ddcd64a78905b +size 184434 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..58d16f188dd6979b4302b88f1f3fd4e7098c1362 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a711855e287dadcc33caaa99f6fc9c45c8dc9e9972d6c27be5e827ce0bec600d +size 111552 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9c4e8c71a6ec1ff5e94dab4038d1d4374672c277 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb3bf8aad93f1475575582cea1a4717aa385dbbbc0b0c1588169efcc3869e86b +size 132095 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..345aceafb160cf66a504e4e9c839ff77481504d5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1701dcf32f6fd09b01afb728eb4fb6119b4ddcd07e1fb7835a86b6dad1ae4d11 +size 152231 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_4.jsonl new file mode 100644 
index 0000000000000000000000000000000000000000..b191858448710073dabc87c9ffd293a7e6e014cb --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e99129ac4f71803b68c62171850f8eeead071c9f9069da25fb13f9aad022276 +size 172056 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ddee87eea2e3e366a1f21fdf8391d7188e4780f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_best_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98622d6d7ea181933f1649d754b6394d2daa2a4b468eecdee42eed8818288124 +size 191979 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..490ccdf4881ff378a99390bec3b7a2f134ec5b38 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f29853374c1cabb9e8eab25f3d6ea645e4e47edd544cae95b319b79d8a230c34 +size 175860 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f0ad8d6de48940ad6f5f34d112282aac7d2ad477 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a0cbd267582535862d6040bcff331a1ca4bcd4b831cb9fa874c80ec3aca6a4b +size 105061 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9b7d8b5b036617e19594569c5e52016f70fa104a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdf9d3115b0117fe9e2f893f8c650796c0736e3f56b9b5779bcf1d135795bdd6 +size 123388 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..38ef992f0645fc016e2469db7322864a92c7f18e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94d4cafc012ad97d44fe09b6a1d74ff2433d17db596b538181cc86f1b02b1609 +size 141418 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae6eb141a7160249e752191a29225bc5b09e27d9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0a1ac669e1e242e5a73e17bfb574e9b8aca7a2cea462bcd2326848227a5e11f +size 159090 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..0879b0df8031f807e3b46db4140fdcbcebe5ff3b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_cause_effect_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aff42b58d213cf44c7807c548ea697cd449d1fa4a8dced9b7a745fe1dba1ef05 +size 176940 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ee364c0c46bcb3994c72dbc9806f74df9a7b29a5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df7f28fba22976ace78320f292a57058317afc8c17cf1a064fb7ae0eecb0dcae +size 170420 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..10de7a46d265566896026e9325b29f99d93c5a33 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db39a0d05d669a6f468342a2d4e1d16a66cfbc1600c989f8c248d8676707f3e0 +size 101236 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a9499e6ad3b15f08178b9f6681932287e8c6a902 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52181e2030202601db2341d14e33599307fa9ffb12e7e567496869b127da3ec3 +size 118473 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..51109bcb7f8b36dc6bada00dd76d37bd1cec10db --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:398492824ee151151968e40ca286000fd5dae07af095724ac936fc61b22dc1ee +size 135415 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd077979c76ed8ff8f394c587b775302149438d1 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:252035077e95fff8177fd99c0292d258c3e9dca1b37942e2b0ca11262a52dbb7 +size 151976 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4be4cea128dae7983e2a97559186df27eb6d29a1 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_choose_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:536e2ac4bc10cbe2900379b588f448a83b8ede0a7e253f1c8ae95449c6fcbc23 +size 168782 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45bf12eea1601e12c8d99df6b8856ca5a0a55948 --- /dev/null +++ 
b/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1d2d6af2a03c2bf06e1e6e8dde24777336e4b1c6dd2387f9658d8fcf2f7a926 +size 193614 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2e842d598a9e5fb1c0ed67e38f48f474e4e3394a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71631b545f291acdaa9eb604dc3c2a148ef86dd877c8d627e1f43c5868d584f8 +size 118261 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d565c5e9ab3a0e4463bd618ab0b77d4f9e1ea707 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd99c40b615efcf5557c0192d5ef8251f4177d20e26593214fb0b43c1d5f8c71 +size 140907 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2162750db8f23671d5dcfd930448a7e11b7eb186 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c6e575f54ee914cf78acb2e61a200fc04bf03e01f6822fc38e71fffc5926d0f +size 163201 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f4423a96eec9217381a37d032867568dba16c8b3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cccbd8183682fa98dbb5b51b0fcae4c50e18ae52a91e5618c33e572ea26c9a2 +size 185179 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e11203f87eee7b2060fdde8373a2818838fc21d8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1405e3937602e7a875af5880bfea964309850d37dbca3915f5e3b3152bfb1a3d +size 207307 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0f99ac428672e07cbcdd7e0681fb64f5fc6643b8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc514c6951af9d602f1666e56ed9a08e1138ab8e2cb5cda96ee12ff9feb485ad +size 191580 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..1ac341f5b3690ab4b39725240f662b9c7a74c52f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90e061a1feed59805187fc553b5f17ddfbb6814f07f0c8148d7df5903cdeb14d +size 115647 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f400053e5278a1bd241a37e2ebf409ac43f69d8b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e197998a4184e9e7a39ad6376c362762007a51f4923cab6e6fe6d1482043548 +size 136665 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2c47d09822af3eae8023f39b0df19ceb26785ac0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f14c1f65b3e4106806178b7479acd985b341d620a9cbf01dd625edfdc3eb9d4 +size 157406 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ee18a6b06b42d9e31baffc34190768b679efc7cf --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe40f5c459a6c84630a9377b29028ddfd154b5d282e56b34615682e99149db4b +size 177744 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e81ade4be1d37ba00b141c20655481eeb3b8394 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_copa_plausible_alternatives_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efd1ca2dbd958f29d10a6ad97c072f59e21442e522c96257b449c42ddfebeb7d +size 198338 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b473990dec504c4ca347d06ae5c0d404baf0dc1 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399cdf8c6207d02c5835409b811c4ff72e0670cab0db46b21102cf313f79a400 +size 3492341 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6cc857887ba3453ab8d191e46d8624faa32559c3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6fc8629b5de3e85dfd21fef4a963495f945749b1feeaf1bbb621ebf50692bb4 +size 
3914184 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d31bfe77b51a4a908fb5ea374b563be0794fbe97 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:319a80435df7c44874bec3854323b7698ae490a427cca593b6dc3cfb7f7ad3c9 +size 4899419 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b303322992a46540d0e5b57cd4e09f9b64724f8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4883d387fa450722cb1d6763ae71b4ef9623d2b765aef9b3006958eb502c956 +size 5837543 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..014f39e23f598ef721ec5569d6099df3bacdb883 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dcf87353479b8993f8aec3b3cceacf74e188b42fd13a2f6bf79324bce0b5a7c +size 6771896 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4e22ba4306b75c3891cb69073414ee3d26824754 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_coherent_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de767eaf4f9b8f0b5245e0998fde41229bc2683d6cfc51e8b085a7d4b74b8e15 +size 7715623 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f5b8a303c55256091dfe0595d8792a1fa48ce61 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18ed9305fcacdb7bd1b47a7a928bb56151fb25d04e82673749bee8e0e2a1fa45 +size 3367001 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eeecd3b5f13675c500dacd1df3dc69792b490234 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f566c6f25aefcb0309fef0d645b96e6c16e7e73ac94e07ce97e3a5c624b9c5f7 +size 3846214 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..408d8061fc84c955daca635e904998a834bc8986 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a1a0a5de8696337c649444eddaf12936f586ad5c437908a2b52fc693f7d1534 +size 4798423 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c64fef7e1834920197705b8d07ef2b1656aac686 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bb1a152bac9376308e78768e9caccf801722d6b3b58e5a9657ddc7e9783150 +size 5712960 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..09fbd96a70266b2f8bc19d49c6873d5bc1bee69e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c831af6709accb013605bfa74b2f523cbdf0ba50ea4461bab617015db5f30b0c +size 6622941 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..400f4b480accc7c98ab5e811e5bb829d8d4b8b8d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04d1baa37f80f5a24cf0cce9190ae997acbba018fbe5be2b194671868946d6d1 +size 7538135 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b66fbce369446ed3ce2025ef6146b3c6c0fe33de --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a976821c3af8653869cc5731726bbb973f1e7fd95aee25ffe6066ed1d6a7429b +size 3869278 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c16055e1cc7a4555ff5ca3e422154217dac98e69 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ceac662a2e36766e18435d6665898130d3f17d9f38593332fc4c815b4efdc8 +size 9954036 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1548c4a128678e749f64b193426739a07fe6aeb8 
--- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ce5423d47b59d561ddf32536191fe6f3414189ce288ae16e824edc9dbf292a9 +size 12020130 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..80d436b810b8fe54d046b09ae2a1463909fc6f2d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46ff7a56a6944ae390c69fd8c7f928349a6b74df1d6adfae154bd9078f251c12 +size 7056971 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3f5e13b22f5e5e5650c70984c125a3fa9cee0bce --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb84de0d3124419043e7c99c722fa5f73097a87d17829f71bcfee7f72a387d4f +size 8113216 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8166d09bd3ceac3377d8053374eb18be32fb400c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5500fef93f5f767b2a814f57b372ebc82129515532058e3e610afd2568ee1c6a +size 9168451 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7c2d789183a8d1da274919a6c8c581d5ebf394b0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ad83100f6400a62864a263adc9c95e7e2aac9777465b90ddf164711a32e3cf3 +size 3705535 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..536b88705a218cbbcb4629c78946fdcd3525db4e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:327563df38a3baa4659ec10e0a33f6ef2674a3354d550f3c6d48dc8889e7fe87 +size 5004314 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5c84bb89da8a49e5d22b6c08a833c27cf6ee0d34 --- 
/dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f14aa88d321675310a73f8af8bf0641348b491472470ecd37e332f1ec87d5ca1 +size 12193002 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e20dfd4383b0c5902c4620984487e3e4ec33240b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2de91ddfa11748d6252fd108715ea902b65f8f487cbd8e48b467d0d8aa648eb +size 7180520 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..239b5c7b95b53eab8f370ee6f8d39e06d5868dd0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa85dc627c228a83493750465581ef46ee7a778874362d28a10b17c390eb8218 +size 8262488 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ecf41440535ad540766c65cbfe56640ae58efd79 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffdf2da82aefc3e87e1bf9c0ba8fb376e09478681bbef0b5c334bffd955b6767 +size 9352561 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..43e12fe031430bb4b17327c03d059405ea827d76 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c551da2ffb317c147ec01ed20b3aa2dac10baecdc0fad907c98f04d4ac8f5249 +size 3232017 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb8a7bbdc1af764739f12a6eccf7eebbb25d4cff --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba9eec1f5f7074b565b7a0ad6860dfdf8b9116cbc068a0588d1baf565e73d26b +size 3461197 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed9f647d781e206464264247df34711f0c4f6819 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79eba2784bd397de8cc0add906e07b7ad8d367ba0181c685593a6d6c232ab2be +size 4250447 diff --git 
a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d3d8c631ab534d8bcd6312f9ca73c3591c1f9688 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40e3d978f93f660f99924a313255176c2a4327c8154e5aee58aa22cc1891f3da +size 5041757 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a1c1e50e5f6d07018fdd06b796221e850b71f15a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f491c37f605b8bdfc5a9cc507b795498af204b0f5cee058e8410090ba071a6b2 +size 5825923 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..834c8c93bf73139cdc0381da8408daacbcbfdceb --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_e2e_nlg_cleaned_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e92ec48cdc3e95b46217d50e9af0598134c016bae47f6cd21e15b94327f279b2 +size 6622464 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..270ab60b81d46f69761df792606962c08e969ae4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb55fd0bf6e63749a270522d6d4301976618840c4acd589e3fc2432ebf8a1693 +size 2888086 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00e7e344b9378356ef9452e06511ee0c16cbcfd3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3476dde8265f1265d33bfc36da25966679977e321c3c6f9203b6932afd7c5d3a +size 5084240 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..999abb743c7a1b53a5023fb9d78a302ee85875c4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b0fd18a295a330e4f3394743ecd8897ac77fc9aa4e59273bab8d465d9e2e403 +size 14699826 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..9707e78d4a321533bbe4b0b22e6919780c67b28e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4ea7f152ac53f8072c66287d493ebecbecbc331cc463411cc00b39ffed9a8c0 +size 9656669 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..08f8d7c060953a4cde7256c57a5e3b00ad88daad --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d400f06589cb758672137b56681067d2840f83cbea322e8839f19230af0ca24 +size 11799911 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4adba37a50b35d4f9990ab724acd49f5fb4b1db9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d995e98bbe5714ba2d77c0a7ac4115a23df24da48f6ecf3207d8ee3b8d4b681e +size 14076791 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5dec33f93d42329781e9348fc994b625ec3d674 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a82a487c6b036e31cdab59f6701ad53b4c317bc5c62cc29798f39a66a7b071be +size 2787777 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8b20ca81a507bf86d6a7d05c6ce2250121d0e22 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3bb3d23b6f47468d95985402ccf370f0c8d6d14936597502eb9fd57707a7769 +size 4880061 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..03fc9c0300e93e9e4d8c7bd7236466ff17a3718f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b8eca26b6d69ab6a02340afd596e6423ce3dcaf7f91b100ff95eaf17df6d773 +size 14226582 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d2e8c780b5fe92d217dde5da70419b93f3b66c7 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fccc6f14e87c3dc571435c991ae2c3f20ad74cb48029687309ce29dea64f0dbf +size 9377078 diff 
--git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e6a7f063aef9e1c30eb895819657b54d98d637f5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b46f09b724e6d18ec72b273541fcf26ff24cd8b6ab1b94e80189fe1e01fb023 +size 11534792 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d706d7fbb28fa94431a2bc456599ba1322baf87e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_DOC_tldr_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fbebc378619f356e814fa00977d1a6a98feb06eb08f348cebc0142ef5ad25df +size 13794621 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b2adc6b67fa051ad907b3a3b3f0d3d1fdaf36f0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04dc2976ebc09248d08df5ed108177a2986c01fdddf9b396a4111938c40705b6 +size 2817489 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0fecc3b13124fc94d0c1ab524fbe60b831e8a419 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d87962220b6bfd7434fde78e3517d85f9d5456f42daa18f17aeae0371a31a760 +size 4992779 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e7702c116640fd0073dc08436b717ba9a78f94ea --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3df7e4cbccba15a6b84325caca8e77fecbe9822cfcb12922dc06f9a99da047ac +size 14514250 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..abe799e8ad6a45558db35fb9b724246e92b36c5d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f8e36d6d6642cd6bba2dc9849a05c3cd62cfae4e2413ef3a7cdecf5fd1ff3e3 +size 9534773 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..75d5e2023b62378ac53339192f5d8c70d04684ab --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_4.jsonl 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:851fffbebccc4707ead0f158b5146c95332eceed2a9109e63e8d768ae18345ab +size 11643463 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..70d8a93540b2a638141c893e39b236ec0255e070 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ea2fbd5b3f935ee8077afd022031ffc1c5e70aa6c5025e227d6ef4a4b7993cd +size 13897343 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3abf063445d72dfd800e11fc5e1fce34e839355b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:389abf71a2509d8c2237e60b246b86b40008f47bdd4da6bfd1cc30a784c6997d +size 2821369 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..31247b187a8082abfcc82ff11fef241c06be7674 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:405ea47f066d2dda2490b55e6a2ee0d3fb3969c972f944cdc2ad282e8e158d03 +size 5005819 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..025181f012ff273de95502319dd88c6168b0c750 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d836209c3aef10d25c858b2b893dab292df453003fd270eeb64e2a0305220449 +size 14490578 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d197ec863c3abedd5e2018d65c90c4d8cd060b2 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8efc6ea7f594d7ced6f97f6b9e99867d9d85ea6ca08af264bf8404d35fac8f2b +size 9505845 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d73ea9e3fe256bcd2f5267047b497dcec1d85bb5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d231d7db5f3649ab57f7313b108ffdbaa8d69de334820fdd8ad332b81f6dc47d +size 11625518 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..2a747c871aef6c949ab28d7b7b9b9b701308237d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_DOC_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00078ec43c42efc22b0df2ef4f1dff986e2bc8b0b3483498fb15822560e8642c +size 13882330 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d17495e405688f2112fd1f4a6bf9b047f86e58a9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b1a79fb26528c0b3f80e369add509ad59c007d2c85dbd26786eae1b043079d6 +size 2867354 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e745402674825681c4123346a1eacbb3895c6205 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e0e3c1c9f2068d299a377f480c17b1da573f9dda021deb6286fe214ab7ad967 +size 5072310 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..44a28491d72b6a7f5d45bc4b715e157863507d66 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cf2d4d285c41ca71a9c0e7d259eebae5921a4e0cb8eb1d9174b0ad06a6a8806 +size 14697228 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4ffdd9da242a01eb834e1db655357fbcf77e3bb0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcbbe55ece0379bea23d3b079535e27990f94474aa69cded87babfc369a98b5 +size 9642213 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f31d0d9646a806cdcf7d0362c5032effd71d4286 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be7340052999183275ec96c54d8bb26970dcca9fefaf155fe739feff4373918f +size 11748738 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a217da2165d0414d1aced729711cecf22ac8bb6 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cfe2da0949ee4c3d9b5f6f1604e51320ba667abef7da0de78f05c51d3a3fadd +size 14018632 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e18b96028a3d1c6d2d467f681152eb8a70c1a22a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b202f1905705cedef24671a33070af5c0a0266061c1e5529df03959252af3f25 +size 7278898 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b0791a1dbb28e82887f35dd8bdeaba461766fdd --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:491fa2c2dccba8ec2bc0983a9c4b74e3561969b6bc3c5429f41b244b4cbe8298 +size 2596562 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..932dcabd10fbbd5cc4f0db6b144aba0b54f0f42b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9dece6896230c8b3066ccefdf8cbce5f72b99d16ac3b09370ae67af2e53ded8 +size 6542852 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..036bec09810ae7f22b6e33bb7053f052d33b6a1c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac91f12916f4d2f0a1e37b68317afcf476325f45a32fb94d95d51ff6a9fdb7a4 +size 3946526 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c28d082889e0f8724537e4139e55f055c59e3bfe --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6858bc178cb67623174e8f27343fe3b0b5059c65000a2ea1cf1caad501a87926 +size 4640944 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ea0a4058f7e47e32ddbc7246da5df8250816e601 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_Correct-the-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd37d64f890e12bbf67006c1bb83d4fa9ef686bda280ec607fc4949109f6cff7 +size 5325714 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..e99f0a94cf7194d9fa130192d77f9b9053d21cce --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba4d749cb6f1fd33a212e0d31b7e18f3e7ac6e44d37d636e641cf4f852372521 +size 2115935 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8e8eb779b1538e7a8f7660d3602cc4ffb980a08d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8e7dd28d740695d966061abb0201650932c0340d7712e5973e53c09dfa11e0a +size 2900909 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..66043e066f3dab938e43a5d1bab1b660b8f65b3f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8809a2b66603ddfb1018ff61f1b3dd4bd873b4340c9b348a5ad2d75bb337f902 +size 3683911 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1eceb5eba91de5a2b6a0251cc41d6598ea43a41 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0f0a1b10eb8f501f6c2ea6f252702fe2fed97e2f4c04c3da776de9ca6b2274c +size 4458372 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ce72687af159ae3989219df3b219574249582c5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f6de44da5cc3976e8531ee7e2f86d05fe587d1596a80ccdd0af3a7b8e2c544d +size 5254516 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..40b0dee7e22e7c68a3d163c9d7228acf73e110b7 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_choose-the-most-appropriate-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d7a2df71290db63e61358cf4ac67ced3a9784b73ba7658086b2a27fc3e7c9e8 +size 6040677 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..50719d9cbd2a58823dd6dd1d3d4e26c99e6f930e --- /dev/null +++ 
b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94ce73a65cd0b1abe151bf4d19874be65425d24f4a4436d30e736cf4d636ea3c +size 6023710 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..925bc40cc9da556117892e039ce9e8dbe8b81bac --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a433292160214880ab2830836f30cbf6afb7de1a108c558fb521a41099e5aefc +size 4830228 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6213e957cb85a05c32e4176448b3b9e842b57b55 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e12520e47bb7ca2176108f39b0d2b75c66d3461554b0717b8e610ecf71b08a9 +size 3880362 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aac9af81e467bb3e52d43c3796515b69e5937d83 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:306780ad0129085074f495c298c0b6223f82cba6c66fee3c999283b833c18e63 +size 2126265 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b030ed0aa7068eef04331c4acbb52a63884267bc --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40740441cd52f8175558bdadbb7ef19410e1c294094a156a1d6e80e68599733e +size 2406196 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..853f5d125366ae732c71df1e22ac1ad84cd7ec20 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_no-prompt-needed_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dd902bd191a2457761fe98477299ee5f620f138ea360ebe4d0f772e119ebad1 +size 2690398 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0df71d75937dff1f302caf246cbb67a86c76826c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cc083cca0241f1348769f1647b9aeba62e3c090c47725142687bb7549cf3adb +size 1864129 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_1.jsonl new file 
mode 100644 index 0000000000000000000000000000000000000000..1c69a654fee998d5cd793503124dbad2337828d6 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d391e96b6951a8c7b0b2a292159bff1720b4418bedf5e983620c7015683ee41b +size 2557203 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4b6ca0dabfabb8c882cb9e3a5d378f9db9c731a9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d8cdfda7d190a2b06a36986a0031cf12719cfdfa261219dac10c764fe3ed6ae +size 3248305 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1c2cec969e9923111b872596187e9f669a62e2de --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e4f0424c8959b3a069e6d16b22a5b2f181eae13c22f00f527089171f7554512 +size 3930866 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..72ed8812139471c2297f7bdaa1fe529200bb276d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee5e03015cfdd3a86dd9aa791849206a6e709239768ec347048ef893de3a5634 +size 4635110 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b885fe8afc4112d5a3bfb1aa12cfe8ae44bf80df --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_pick_correct_choice_index_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:970ce6efd013fc783917e339d018b7d1b9c19c0a05e001497862a865b83e3033 +size 5329371 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e294a3d700cbef3a067fba6c7023b248247fd62 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85cb6526692cdb85faabb04b76a1782ef91fb26ab4e63bcba72fb668f0b03f6f +size 2257257 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1e9f6b134d405713f22f3736cc17eaa8cbd5c9cd --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:07f4b6c8016318c766f8ab6696bd2c457b15bb4b84fefaa76241411d87d3fa84 +size 3018436 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f37f01de01d1026333a2239d3af1293f36efc736 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55429f878284a9b982452e0ce2531cde33988564bf9983c3e5d64b58669b19ae +size 3774631 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..22f6301f00168ee1087c08610c6059433bc1da19 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2881a678acf97e1685c2066e79d243fd82396ab044e052c0035ac5d6928bcc72 +size 4519404 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..de787e9661d80d41c26e53fb982da0a1188d82b9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86aa2babf777f5e3fef18dd702ca85f904fec1ea297c75a5ec84f383590140e5 +size 5294440 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0b28659d5ea86d6d32b21754a4894d8f08697fb4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_piqa_what_is_the_correct_ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6db7e3b7a6625370daae3e4f584f0a9571e6adacebc882256f8e41c76381080b +size 6055456 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8149ba20fdbeff21f9214fedbed8015e75d64e7 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:255c38ac884b65ab9abe00fb1806775bdba9b547004f7d75a6c47754d0070643 +size 639911 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a7ae33e543859ae2b1df90e1c164f67793ee8846 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43595274b4c40382329f27538137a8955e26b8c2b73845caf17821d22aecde08 +size 755064 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl 
b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26b620b6ecb3705fe9cd75ff41fab217d4071a89 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:262efaee52311a8f471859ce9aa21e6aaa4153202c5e0afcd9e6c6bd34ae5932 +size 871269 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7d3a1971b9d8a76eef562737f2cd9fd3b923ab3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c515a202b292258cb587e07f1df96b763c7fb935123dc74bc5def03db8ca408 +size 985728 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dff6930059fc90d2703c212c633fb4cfa2f29996 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba914b6d394c162c5b6c6e431c1e6daca3ad10b3728fecbbd86ba17ce7cb7b8d +size 1098427 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e000578a1eb95c2a17f8c5aeb56aa815cf568103 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aca90a7bf986bf9cf4af0e9755ab6e82dd2e6e5805681f989d9cb36f95a2495b +size 1213581 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ff97768d6f707b680faa43eede5de4005d6dcf0e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef57c65bff36bd13b42aee28f4ab51405c0917fb689bfaae833d986da1a2c131 +size 1182435 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8e34f82b5c30c682a92b8888f52e0ddbd634af15 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96ebd097a038fbd3a0b3c5a4e6d170f612ee470e2685a796cfd7061be7091dfc +size 1779271 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..554e4caeb4eebee43512bdad0214bd3d3fce85bf --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:259052e1aa5687d4f397578bd0d6023e3d0b627a6f98297f5ff57a791f111ec5 +size 2388450 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8f7004d0bac063e8afcf7651c316c8238faff3dc --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ffbf4dd7536efb8262e53f84c2b43f6f5984c36fee5d73317cfa91a031d1a14 +size 2973440 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9bbf67922139ba73fc78aae8d264c2644117f8b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f63ee4b25afdc7983e6b48d61922f9aa78bde7defc678ac88cba954805204c42 +size 3555802 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dacde192385602a2842b7be0ad7d6d3d314d3cbe --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Direct-Question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f607be96e24a238c0dd8fde83d6b665ad8dc43ca722ee5dbcde4d4c85e952f51 +size 4144622 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cea07088584dee05236cd0f7a68c7f19dec556fb --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9fc83aafae4d6e3244905a5d71f189f065a9ef7dbb316c0de51c04a38de1d97 +size 1328969 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3cbb308860de1caa3941b2da9d416e6e5ecfd5af --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:084151706d319d6fc6dda829d240d5fa535d0d90b84e5260c258a3dd85b37160 +size 1524834 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14519b1b200b74d2a346772df49c429b7c2a737c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6072d092aadf05892a389a1f472238abd66d8ce5436ed5365bb12968960bb2f +size 1720815 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl new file mode 100644 
index 0000000000000000000000000000000000000000..c1aa810e7592e8ce595af239c41d8665508d709e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a3d03085da0d2503a964efe76eb2ccaf9c6199890d2527225d319fa27357a44 +size 1915857 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2490482c8e2a5329c7850d85e8c19e4f6faafdd8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d6b26f3be6e80fb0215e6a20673603b8a4d3ad6523ebad2af5487bcbf8242ac +size 2107663 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ac86af2d8f98d4fefe4d2f479613ea88cef1de3b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c59c3a86b3b31baea3c4bf8d83b3dfbd7ed20121be91879d130d2273282100c9 +size 2301247 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cafd25cbd758f0b9486ecc5453d189933b0eaf1d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e57efff157c98b8314d6199bba2d80dc165b742e1e61409d24b9f501982b3eb +size 1935152 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3f2cfae7405517217661dda5ac062a878b69d42b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c998d13cdadb037a7f68375af596318678e54356453bd8c355a1872c81f16ac +size 2636490 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..345ee20a8ffd612ae6ef3f049ce14664dd5b3d1b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b38000867803a43b25e58e7b54a3c903d93841b9941cfe25678362a38c0e807 +size 3349548 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..177b6b2b3846abe0d8e2a9fa383acc096986ac6d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_3.jsonl @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55f16bb43ef2c94c7cca3c9512357856b27f69d6cc669c2c89d363d2c0a31121 +size 4039308 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c320a1e17485fb589515dad8a9583fad8719e998 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c20539f73e3513261aa21c3398942b62e0858a505bf2daed3c57075612aa3f5 +size 4724618 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3dde55e2becee4e5112be47d7cf14da3bea82bf1 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice-Question-First_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ac49476e0d618c267a9ee9c1002f904a4630304edbbb40081d852e145a2867 +size 5415689 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c4ec387873ceab5db31031dea0a7f64c672760cc --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09cd6595be0df75f06ee8b108d7c0c737d35f08148275a674d512530e4f05314 +size 1870128 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e33d9ba7b67125e211413e4b633a80231c788419 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03908ee11e6008f4d0653ff514b4712c3550f1ee8d72dfa40d2eb1e313323868 +size 2545526 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f41a7db1d46a47c987362c14d902a00e1fc3986 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29a759e2d311167006a871f71d91e46539954e9433b15875d12c47f9aea8c578 +size 3232617 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2fdc8c2890a81f23a33a8c551344e81fe5491067 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae5a32a13f35fc0d4f2d3858be7acfae5305e0e333b09977e2579a634b590051 +size 3896196 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..95e5b9b6952ddf6df87f6ae0d0f5dc24c5038363 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57dc0f8a7b1f9502b7b3aa564200cc1855b7f661eaaf95e54737c89ff8d27035 +size 4555519 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4aa0b886421a0cd1be0da254a3961f9e24d79c05 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_sciq_Multiple-Choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c94ec4df7448cfa728f345bb46fb9b6ee5dd13ef39b0b258cd1cd22a3d53acf4 +size 5220684 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc063b75135635827313db9400f98b38858434be --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f69fea79aec1e20b585f5b88bec140c20a3a1fe5944bcb811704cef8883f06d +size 2213347 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f0762cd97d9af450411866b7e2d40987d817e900 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f8f525669f7a744745871b34abc33b262709a6a16bcd4adb9f75a1f8fe850a3 +size 2974868 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4af0450792f00f3f0543b7164c33416481574395 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e297016725bd74382c2b8269a2b7adafb2824b8a4e076c07ce86712e8543b54e +size 3727058 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d568e3f59b90b91293212dbf4bb7e87d0044f4ac --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b446acb8bb3afaef86e07d44cf80dd1bda1d429dc929923df8e9e5cc06aaa425 +size 4478655 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..248b1b0dd63c6d9d2063cb52afdf38906c28a686 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a070a118113bfe56b1088b87fd01fdcda871b0c67e26fa3cb624b6395689eb93 +size 5230563 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61b0cf6a84dc09594a6764ac4a32d2575eee9e8a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Answer-Given-options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43346ff1fe66829d5ae0864f271e74d7de86f6f10db0e78c24d5f8cee9d9f700 +size 5981098 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..502db2b2fb901a0010bc0fcdc8cca676e4397f7a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7954baf2bade5b94e0809aeccd7b888dda7b701e8ede7782d0e0cd3cf8740663 +size 2361137 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bef734f13f953a02ddf76a195fbeb18a100f8e2f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b9f07dc5357aeb17739b584da080f6803f2387c570d887d5358b61524b09bbe +size 3193555 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0c467e5b37abd85e892ec95c65f949fff2b90220 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7828f23ca165fd7cd402d8efda24c121900b32629376e507811966579688639f +size 4016285 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cee94186aa6cceb3c0135a9d1818c88a68355e98 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea30976fe963a5eec7ce0cc734c0d335259072a8774e5e9c20e45e7aa999c771 +size 4838413 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..08251b66d0bb6315813b789e9897c9469671f451 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32247fef152841cea90a3376ba956d6408ac6315f81a310e4ec5103b2ee4a694 +size 5661218 diff --git 
a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63a1f1440d508f9c3ec69632bbbe8c9807ce83ce --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feb513296f6ada0b6b9399507459654c97bbda326ce2597e8b916fdf7c1a4a91 +size 6483085 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a0a443caeac4d671d4b4b763d6046daefb4e82e8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:878b6c9f1e8a5f755e3cdf1499217bc6359551bec22a4fb1ae9dff2da57acf23 +size 1879027 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f000f4c5d6ac31ba3ad28d67706b4ef9c52b8c9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a0cc4cf0a4426d48375f4d252d8c6b8d80a6c6c045843102fb83d03c6e64d4f +size 2434890 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..002789302ea801e276df25b90a948c3995073567 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c2c2d0ce788a6e4bc322911cbaf18064f8c12e48e15b338a51084de14a77847 +size 2985254 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..82d86fdb28c7aa2a345acb3d2b93bd96d5e3d572 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:554b63d3153e16a9a219251d3a7d8da1664fc129eac0859adfdf178f6f9cfd52 +size 3533666 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dbbf505d04b0eef7c495e96cd31e469561ae8ea3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33a9422e5b240daa74eb6b26769c7328d307355f43b9c970b9e496515bc3a847 +size 4082138 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..e31671ca09b8a456779d298bd31ff650a26c0de2 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Generate-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dcb97ad9cedcefc669ca1bfb49b64eae65990c5617f6e7b07abfafd85a69bb3 +size 4629127 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f3a25eccf964ce61e5755152409b2f67534a6a4 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:973af1bc2cb9e5d954c6a4cad96d9a0fd6a1dd231fac173633f9209ce77c27d7 +size 2371012 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a403f7155f7414fb539be45bda5b2e5ee0679c8a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d26cbf66864638cb620c75a523cc1b35e180e1791d5b2903ed4676755270e04 +size 3210745 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..768c158829c928269d953bd5e6e1f62145e9085b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0a2174e7196209223fb350041278338d641e7151659049da5ea612e10a3968d +size 4041391 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d77f13806b893c57d0ef925efb6d09dbaa96cc0d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4611c8cfd8a2d2d30cfe488d91cd102de088cb85e33849da5074b7aaa042f625 +size 4871475 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7999394501693bcf31d7d07c29a11867bda4b828 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b45292715cfd8110050918364b8bfe2eeca9130ceaf07857b4bf0d920cc0814 +size 5702089 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5d52e6eac8e18e270be86623365cd70fddc336ae --- /dev/null +++ 
b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ed296abae095e4b85a6bc69c42cf9caafe1cff4e504d83cab5c5d22dadaa531 +size 6531251 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9d0c82647e30fdfd1a09181a4591f4ca7f92978e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6159dee9f221b4dc20e2a2c8edcf618cdeeeb08db892ff3cee7af5eeab3f11c0 +size 2344324 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..12906f8c8cd0df154160ba19e52af1a1d511ce8d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcc1519ef1c2431b7d7a74444f461539d1fc1df84fc0a0a39273be81fd6ff0c9 +size 3158283 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d8c4c92cabf63e35a81212f8154b9b64f0e1ff3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2807207c7be521497eda4a04fa9ddabd54697c9829b43a8d14e228282e4bc567 +size 3962362 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b47c8fd3ae854e67888ab099e9a3770faf19e15c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b3c84d876216ae69f7b9f35ffebada31cf95c2dda153cd6e33c3af77b916fd1 +size 4765340 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e14759f2a039e7179c262c5ed3d12c92962c76b0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d4381bd811cf4965c7bb793296af9c01caee7c3f3310280115a89d677514d7f +size 5569650 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a213e7370a4bc544330681f149ea6319c1336ecd --- 
/dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:105bc5084b320aa02d9e390eefb7de7c7fa0f0c5ce2129158479ce8b24495303 +size 6372409 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b503821e93d7bc34b4ae449e4c187b6de099e45d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52b318f36e336ef9bdf1ace3c8c02450f2790887636200d7cd2db7adda0cb7b6 +size 500964 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..753b5fffaccd1cedb76bdf6a2b6354a44a09ab36 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08e8e996752f73c4acc02bdb55d6a36508a6eca652236637ef8f6fe24d7e447f +size 351330 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..17bf5079996786bd4b03f235af94fb09884fc919 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:745a24cb9008ef5a7315d35bf967c060313450cf854f093bcdc25139e9b38c66 +size 449649 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e03dbd87d5cf20888c323240b12525d5877d7df9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58ff10af01f72f73431d751910575bff87bdf3413da879f6289e22d813c55c13 +size 553006 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..916c7568d4ad5b923cd17731fdd54350702b3e40 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af7a71b513cd0baa40bd3e000bd6e68cbefd4505a88323c386a0078979cf462f +size 651617 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f575d890b029ef3741183cb9ed1eb038b5b946e5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efc54b9a8b7c9c518c50da9e81908ef8b00a55db9252599d9bba27f48139612b +size 747525 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_0.jsonl 
b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5930dc8a50209d85494564f44bd2e65ab98b326b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65e80a0162951ebe90217c383050d9da4972c800f8c29b05b140ce670455bed1 +size 586308 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77c0f8d480bf954fb578d1043459d40a43a0c9a9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0a06f820278d561016b192bc8075a3b7314a10668b22209a99910f30591f346 +size 415394 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..987da700d9af56e30b61442c664c9e2fab17667c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c82a3ea4b8c33545c753803c2e9074727f0903d1f22acd94ca7ae03b4e299a7b +size 535346 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7cfefa8e9acd1f39bd1fd02c520b3fce35f62a5d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e1f391b42daabba6ca4d7c099668d8b808d07890af2cff1dbc6087f616544d9 +size 660338 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..324027e5ab54e44b0f29503ed54e2fe512536aba --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd19ca46a7a58658208374b3e136772a1532ae99443e5a973ecbbe0fa5907c3f +size 780565 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fcb153a3e86a6c7fc4ed4e8b94f911d3f0298725 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9583e521fde2598b787b3b15e28287283ca911b3b439729f076e5c771f0a50a +size 898070 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..472ba4c60c34391603f28138d19212840b0defce --- /dev/null +++ 
b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71bb23371690ba1d319a8074849e470b4fa666d7957f85f10a0b5d1096defb52 +size 517008 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7f1e9f90edd42679c19c1ccad170abb8b5d1ee53 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c01d60570ff250e9697096049c255bc782a48c634cea2e102cdf550dc9720899 +size 363042 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9e81ed628705b28ca2aa559723a01fc464814cad --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9980c9f18677042c1fea3cb8f5bf2957e13170e5bd36be675a6551822bbfd0a7 +size 465273 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..684fcbc1979db1bb47ceb4ab38699f84490c22d3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bb7f47e2d38f41a3906e47f3070b9886a4d42c2f666230e3c7543879b10d851 +size 572538 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..253f2203f0b22b8e9aa4dbcc380dd03767e0ae7b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69fd6f983c83b6558d523c20cf46c7d8901ebb795619fdf1732362bcf6c05998 +size 675011 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..31a4d9c3188f2a44237fba0f2aaf0665e213cd53 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_does-it-follow-that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44d82c9b4534cb5c277fa5e21ce741c103174c3048c352f2925ed8c61e07b6cf +size 774781 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d1846fc967632474a8fca035d7172556706dda1d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1edbc8f0ab77d6cb445cd0ac899e134a5f796286f1d080aa6217cfdfbd624885 +size 522596 diff --git 
a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2689d3318836b5dcbbc30cb37e82f9d5586fba7b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61227a1214183ae12588455ab074a34a19e1f8ad7f561d8cd1915462b3ca4d5d +size 367750 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b2739223c107735a70d75c415c0a68708f3d89ad --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8674236921e9ee313c69535a4e31fc3a0511535c324331ffc6058f9cf60b059 +size 471930 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5ca6f11fa62a6026c75749fe5851679bccadb0c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74d84d5a61ef145c7d2b4b0a702e91d3d6ec14ab1f04941e3cd3c2dfbe3c0ca2 +size 581139 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..08b85e624136bcedffe79699af2ac6289a1d06cb --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b1e43673f75d907d5285c34a329212e9e75e27a2df8b9018a82544df057e62c +size 685566 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d2a3c1a0042ed1f10d30f6b46ac0b32b0568e7e7 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_guaranteed-true_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f48c4eb2856db590779e4f311f59d6f0e977bc7d0c6bb990e181cd127238d2e +size 787281 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e1b8c85647822e37fa36da789364ae774c2fb75b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21722e25e8d37f47b948cc49b640112624ae0d0240166c3e50e5a84942aac4fd +size 524814 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..351a660da50ad873c9b3e0569cc06ad15494b66f --- /dev/null +++ 
b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d98661700638905f465e2a68a139d716fec832e8af793801dc2503f2643128f +size 369689 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e78a331d3ff214282550319cca063ef119755655 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c9c4683827d8590c105fef9d8129453b57357fc72132f5515d570ec878c8c58 +size 474691 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d284a68d9ff56b46db679bcd4e0c4e91e331e93 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2548af7e982080fc3405c88ad04a9b53b8012f22bb8cd935b9c9553e3a7a9169 +size 584737 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..122c68ab001e7602f9d30c07fb9115cd971456f7 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8071b0bdff62af2a8c21a87b44413dc44d736982aff59e9527539f44dbf785ca +size 690003 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..959e304537792be83d897eb36e4dc31e729d7ae5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_superglue_rte_should-assume_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bbf95e1e69d6071ecf08cef2ca55d16fe90d382143c1b70eb7889b3fdc01f2d +size 792547 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8ae06e2d8dc88429a33e6ffa8d078356b2efe5aa --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c52a77ebc4e0dca55eef8f30d734dbeb442fbb1912ec9016ac0d96d015c3950c +size 1039050 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..73b0118f39700c51833bb62b3f79bab0a7e28701 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:747bb0e7ddbc13c4af209c8eaac2d9b87c1f699dd676452b9c9dc105f36c17c4 +size 1300084 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_2.jsonl new 
file mode 100644 index 0000000000000000000000000000000000000000..ef1607dd4655261e7560bc0565e8aadabf3beca9 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3b1bc9f6ec6b47980fddaddf7dc0798503e156a4c9480e5f5e89315d3246ee0 +size 1561230 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c072d9f0d0eb1171b3c022a1310face5cbd03b0 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dbe4eafc4e1727c622503172fd88ec7a1d1e9dacfea2dc56f071281d4c70765 +size 1822694 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..49650ff96d1201d57ce1fa4c364abb80a7e2a2b2 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bd5d455998654364582408124c2580551a07ded0969f3cf29eb9ced10e9a60a +size 2082989 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f0a199181e95910a61aa26195b5474bc85ec521 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_Replace_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36ad28f9bf022f2dba02f3ad3d314ce6112bee04e8544ca45743eeb35b1639ee +size 2343637 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..656fa31fd2c2373fdc2f92b48ffd6426fa3c482d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae61f0e4e6f583a000253712fe5d9d9ea3ad7391d6d28f6dfd3d59751769ec24 +size 948112 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..685eb1a89f4630bf159864799cf157ffa85b7859 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f98d33777bd5d96694ec016d09088696edd61b6a10c441c0fb395787c9a22c3d +size 1180626 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a520ede68e00402847e768b7ee7c2549dceb197a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e579aff8eaf9e16ff4662904b3a51298af4ec678beab771cf943f17af88c32ff +size 1413176 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_3.jsonl 
b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7f328881dd688748083bb7525400f52484db38b1 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db3e36529772379ff45b50bf7b4b3ab70d74ae27fd8f0bebded1415720269e72 +size 1646024 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d248da9130946bf64e6b710a5c2b215a3fd3e88c --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44cb78d444c8e3b472f8e20c5403dccf3129f7f144b306276932ad34b7e027b9 +size 1877891 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae320e25e8e3a42944fedf8a4f861a3c2c48024a --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_True-or-False_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccfb17d370b542163e164d7ce44910141ec8f3cf11079faf7778d24d570a7dc1 +size 2109982 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..86df07fba48cedb623994fd30c6f53b8c2bf78c8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d87621babc5f9b847dc2a9f3bbcb35265b57e56e64cf78f8a559e93436a35ff7 +size 1010058 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..53e5dc5d92e8b65e9880f09760a5301e3f77235e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:240d22cbc1636235776d951ef20502c99c1ac41a7236c4c9c46711c8c5e7fb34 +size 1243184 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b3912375769f6fb1d521e8cbd1ba15a683e34b7 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e29fda27054062c94bb37e57b2ca26e14ea51c670e4a83aeac92111281cd804c +size 1476406 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e5044e42349da374a9d1e215705da15559bd78a6 --- /dev/null +++ 
b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10649aa4e58bba4a43b53c629dfeaa1d2f83b59573a2481b945ef91035ffd762 +size 1710054 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6b703240f871b383913cde2c49651e57315e1175 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1385a3874018d20f55ecb6fd42f661ea2244f9c6f788988b75852b899ac5346 +size 1942447 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c653d1ebf99a90ea9af1ba99b23ada0e52e659a5 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_does-underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1f6a932dde5f034f070ca72561e27fbc2d193c21bc284a258510968bfbdd0a2 +size 2175303 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..be189f849128bbdedcbf63298adec9482a14b08f --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78f53d8ed580d773d7d74c52651b1a560b9a8213425e7a8e48a0a8e1ae33010f +size 969476 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0b2f9b4581a895a9bbb2e1f1aa99d566a14c127e --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8ae818e42921b6ba900457599bc807adefe73707ffe82014de8ca2e2d0990cd +size 1205144 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8b036133a060117808ec262a9e81ea544d07d6d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:957aee35c3bf0b0337e48013c0607b9aea5a2a38fd6fa7623a0ec93b576086a2 +size 1440863 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..462d6099d636b05005b4336f2cfc4de5e8978a5b --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fed5438c0ee7344f58c6e2d09976bcb5ea811acb2f5e91e12507d8a4f09aca62 +size 1677039 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_4.jsonl 
b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..92a7a95a5606fbe2a2039371d056b8daa46f8061 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2e40843f1e7753f2eae8ac7c30612ba350ae51d9067d7a9eee27c4ed49d82ba +size 1911987 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..963be9acc1da3ddce4c2d839b99d7d0a7135d9bc --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_stand-for_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:845045a1b5c56c79e51e36fed7f1f1b34a4bc09dbf1726b15619ce7ff349fbcd +size 2147386 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_0.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c908ca5d7dd4aed5906c1531cda410ff260cb0ac --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd47c1fcb397e25c284e6223f095804c10229521de599c57561be27fff9f9ba4 +size 1016345 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_1.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..210b35da7abe26652e0664ce3c7c462ddfa150d1 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0689ffd69914aed4cb339e61a5cdc585bd6aaedc9cb8fe8c5c4b8c33863e6fc8 +size 1257081 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_2.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..386c537edac46ee9e75d97b3a38e3d32f2512f3d --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ae9931b60a0f2afabfc11b0b0b3fad9313ddce3c3a5c028e626bbd8e78bf6c9 +size 1497960 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_3.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d3a02bff80edf4086089dec8d4d1679c9d1224b3 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:589451fecc695a375e527a7e91c54045bc7883c31f13aa564b33ed79697d1038 +size 1739171 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_4.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b4ec6d0a5f749ff22d3373fff05af3031c1ea7d8 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:c54263c997f1b2d660538d4d12b5f25acb73a0fb28129aca388f45677606c11d +size 1979227 diff --git a/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_5.jsonl b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c9305c82e95b3d00e244bd60f1f79fff7195d22 --- /dev/null +++ b/4b284b17boscar/eval/examples.4b284b17boscar_winogrande_underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39b88db43197873deb2fc243dad93a78e3d68b917ed0a280f929f3b56763e410 +size 2219655 diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8a665c30c9fd2e4a6128ecbc47fe766c6c9843 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.30064593181269955, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.026879704157341512 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.06703651854735132, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0024391085403758696 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.26740772484207054, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004912155390010422 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.09060657840339362, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001978890806658563 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.028179323090803912, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0013431147572720058 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.11491018942638491, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0030470199083947793 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.039330605607392446, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011751573991520296 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06406482680474013, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0023017838075157946 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.257657677037235, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004746012740112394 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.08688274387361468, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018578198715780445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 
0.0634475056304135, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002344312215241603 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.24940060733167396, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004590470603510631 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.0854912841791626, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018955102722507356 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..35a47678e85bff979efc3619c5fb96fd5513b60e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5448014860907993, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02354902424699172 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.1388827638743681, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004513090850395629 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.30199833689345784, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004833949344945251 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.15412558821549818, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0035554299152960674 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.06750783374322028, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002988686287368457 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.14947512582685754, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033695091348626585 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.07545981652785559, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002357043487541018 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.12450746932788907, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003982256290142074 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 
0.2819898497969715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004422334130125733 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1394146722460062, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002993354560745065 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.12750187523825526, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004097239372173061 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.28488318300316834, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004471716742740801 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1421792554091385, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0031066004309073594 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..855163d72a1183db34adfa337464c6d0cf6b5a2b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7494822896647901, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.045680162494902386 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.16651047001316224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00503164302116687 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.34879486947967936, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00485184387176236 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.18686493440413654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004140752549886329 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.08795085013666165, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003446853896796009 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1803167154721538, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037373578885484243 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 
0.09627247599444268, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0028826866977524927 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.14757163053942288, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004365281542114343 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.32518786802891597, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004501197296979544 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.16801781294867849, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0035458321298973234 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.15178195561235863, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004526255293354419 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3286569936174265, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004546078233004703 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.17158619573308823, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0036701761982919448 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..03adbcc43c8ddc9ee1e75e6c9a11b924423041f6 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.8754863248245969, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04360822451403643 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.177665566463972, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005213159352117334 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.36388817345982977, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004942659870867192 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.19673949819066772, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004214599980941433 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 
0.09644734799007487, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003610059263232589 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1890195299674566, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037165922348924205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.10183935852735193, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0028397205254593197 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.15715897881351443, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00455057510376714 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3364361502529422, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004479392424459559 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.17564046576780104, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00357350787655052 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.16277909758999493, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004769154955939382 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3412001797439044, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004532856553347463 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.18055401062696452, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00373462611859697 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9c67b1909a050c9897da4d951f8efc8e9f223911 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.9609553724467281, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06219622144359429 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.18155711637430752, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005153207013059504 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 
0.37582358230783497, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004868478990903738 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.2035685591724469, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00420720523051007 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.09862370157443069, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003550556891817434 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.19877038562348062, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037326666334346983 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.10691762381917162, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002903183299069371 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.1583476022875235, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00437564418742707 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3452891933106154, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004375446165777104 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1801505100456093, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003508323835548118 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.16428760554962601, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004585243637248548 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3517722828354071, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004454120443642119 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1856568180747029, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003673260642261377 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6b154f667bd1d12895b8bcbe5e6c2fb069ab1bf6 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 
1.0578470733972574, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06748885712032304 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.19481471590160654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005470101363136162 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3790270896990674, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0048619650256576654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.21188453557964756, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004406461443397243 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.10948046927141483, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0038119634607733702 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.20526211579883663, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0038681925059066017 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.11512570203467995, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0030866056829496383 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.17018196029834431, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004683975046675936 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.34939009051908676, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044237864033617015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.18813765770684918, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003736931314323814 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.17731841714776353, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004926877660252843 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3557595779694418, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004485010413076965 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.19423676766572612, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003908165471883654 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff 
--git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d4a5aebc27306c38b3aaf6bd892066a0e4fbc1fa --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.021230789148664757, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00036930334901350486 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.1458873272470996, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001900441045033603 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.03584736696676297, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0005714702584391066 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.0007014836062982539, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 4.9656181708739825e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.004439318110846241, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003216075158994171 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.0011821469183760484, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 8.251246944497151e-05 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.021217584488589816, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0003682938445043042 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.1458356603789047, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018993886668088465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.035826640478609555, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005700890073056944 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.014393803982682208, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00023960077177274328 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.10436119389152143, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001369730349153955 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.024445633867760844, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00037319784494180513 + }, + { 
+ "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 0.00619859112106803, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.00014355209252272556 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0e4a473ac15f2544505443d7bb663d2d646d69fe --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.40779764093890236, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006749470759799865 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.3339982498926477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0051660582576264396 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.32230266861303564, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0048281759328283125 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.19817732601937044, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004893873974067158 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.1573726221734629, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0039063568571845834 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.15178353399364186, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003601245697432249 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.3339851607013017, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0057395182257868795 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.2772342692279814, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004502719763779498 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.26354330056583525, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004061719501266238 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"explicit-graph-description2", + "rougeLsum_precision": 0.355640558427129, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.006060447943606269 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.2909276700332092, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004597339854441208 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.27948934790150487, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004243290285056338 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 6.113913674438396, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.2845444346534936 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..218d627d45e2a358d1c615f07e5deb8fe59bdd56 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.5939976969577568, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00597648294614709 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.4939964608830419, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004854390017630889 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.4849380642186884, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004355815746994613 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.3429952661183253, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00515916785443233 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.28183473457747743, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004314448844451297 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.2746788042383533, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003972200087077648 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 
0.48926352683264596, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005477667118032463 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.41141002664463655, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004507582122597571 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.39871995851472236, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003961110866080806 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5218315934141331, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005640661046456062 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.43384665246421655, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004499937439083978 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.42346399930907747, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003988595875260355 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 10.760879957677538, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.5042906708162518 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6274a93c2432174c7d1a0dd7f4f1a5fa5b2959e2 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6019219169272445, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005604012930862085 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5151330504249194, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004815894474393196 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5053991892967815, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004167102690049179 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.35018891335285957, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0049939012831142415 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.2979084423676019, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004361003950778779 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.290027555024941, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003991847005443158 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.49562886803557976, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005164938327908107 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.42812641981008775, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0045613651384270254 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.41633220740186677, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003931067918794591 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.52905000223195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00528992708924929 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.45306067770178426, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004541275855320315 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4428486235419869, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00391944756063056 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 12.369211984052843, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.448852909648781 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..26dc7525394f6e19aaef1df97849612874140fda --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6140783854575388, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": 
null, + "rouge1_precision_stderr": 0.005431009115559044 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5234983456818771, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004768197563181045 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5188091003949884, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004001638126026132 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.3616064816232795, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004809450409130446 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.30875562772999404, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004425066895675964 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.3021394744685031, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0038954845230459974 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5044009302215042, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005013807703028974 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.43375082514527546, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004499271894605309 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.42595166532510825, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003777679763490047 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5413307307536466, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00514634411644012 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.461768303468851, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004504232243207126 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.45558938836923335, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003763313157684693 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 14.17484508960742, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.5664456945961425 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..59fdcb515afcdf307f6a1e746e1a2165561acd5f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6191143144670236, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005337442887903016 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5264732013696803, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004770759694377333 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5244606185874572, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004015174635271785 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.3665837047538975, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004873489688947118 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.31075640273894506, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004332106658701811 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.30622163667203917, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0038706333867826303 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5082660653417226, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004949576580934907 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4364109573951558, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044637214108884105 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.4306090940048975, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003740221372195741 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.544246848734745, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005082580961481355 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4627900854261117, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004456263162285971 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.45924248488226455, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_fmeasure_stderr": 0.003747137495910571 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 15.295746341312748, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.26791338636163015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8255daf913380a1ef4214acd0f904461c4d85606 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.5105145423402562, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.026171756855261356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.05037228727550738, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001521468424950354 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.3071218918316016, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0043691999674182 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.07765807357653474, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00157051415399817 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.012731823828237326, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005953055496072427 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.08372739502164273, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003169064201453838 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.020127229000738652, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008029287449570219 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.045538158272609604, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013832440826860991 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.2891643639944347, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00405859491152379 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.07063525535553608, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013364210821022566 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.04196005053080567, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014483655274799454 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.24958331003009082, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004179855124707589 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.06378666642891567, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001443233877006769 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..484f3b26c08e5aeaa66525bcd4048b0ddeb5dd32 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 8.939610966084388, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.5491296743636112 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.5062915093921636, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00623855847389292 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.41058204082882704, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004939933475915485 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.4021073167056978, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004419809770211547 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.2697944089487611, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004958802537142803 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.2134665719582161, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00396690561907614 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + 
"rouge2_fmeasure": 0.2086623721496119, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0036579604366172988 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.4167417293385908, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0055300014427683885 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.339652862796795, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0043904468358186306 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.3297792565875231, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003854890830252956 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.4428822362875854, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005748715854833886 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.3575821685337907, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004450913161326163 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.3492594208071959, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003952122386520726 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6fcedd161e7bc70c90dc34623ca508b4f37c3198 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 13.301890998672494, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.28505139412189195 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6208153521814332, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005474019869143439 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.47992261425880245, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004722131355129768 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.49670042513083745, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004080443427810968 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.36134920028292394, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005006394373239957 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.2724687776194162, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004121139027181532 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.2820177000325173, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003846612306357234 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.513387382273534, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005057388660088079 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.39803668729645497, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004363439679150111 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.40938950141860875, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003780074803334432 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.545270404965477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005188121414246482 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.42076726693509325, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00439492850178596 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.4338077009112178, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037912006207288774 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c707167fece6ed98b523f1056e318d424e6236ec --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 14.016710673793215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "bleu_stderr": 0.239430646843486 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6350567970797427, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005217582777531534 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.48991548026332876, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0048380103610504 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.510262226432751, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003962805128007689 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.3768399298727737, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004940475252559112 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.2846474758377686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00421808227846276 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.29589604538503367, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0038655166790178765 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.528667040055799, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004991662892266718 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4078265767945971, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004463504554334357 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.4229438793809449, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003751130990106598 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5612664782836534, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00506727365699781 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.43042919293423204, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004457143541786356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.4476311036451381, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0036949691242316235 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dfe02ccb485308c9cb7df33e2c339e3407c432c7 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 14.147226334370654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.21991183144780196 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6438899385054072, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00518110126025396 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4890513089689242, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004794116210959723 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.5146917306496103, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003960214700402593 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.3827536576685021, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004895445210280526 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.28694164018374313, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004276215543973811 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.30042048261651383, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0038878273195587917 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5359166963471615, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004878807275767118 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.40806729713008705, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004444151439522105 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.4272118078716951, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037310099515515628 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5685304722579841, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004971555014070406 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.42853658116271176, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004446374648146792 + }, 
+ { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.45115401432632946, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037186979256893797 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c6a9413d0fea5c26bb400c249e4615c7b7d17a87 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 14.415231653189736, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.3973135244971607 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6491599899632405, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004964212601778852 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.494038148373054, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004816716712730207 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.5219352094405624, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0038752976995851656 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.38903348514620983, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0048629347647574355 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.29268387759461634, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0042689170185173505 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.30681215749599383, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003817111253434826 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5409007074057642, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004834940468527818 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.41236919893457097, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004434351880386135 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"implicit-graph-description", + "rougeL_fmeasure": 0.4333675825221648, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003667286185960674 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5719383870371011, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004840938070004957 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.43375185696655677, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004452540065472888 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.4570999768435371, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003618688980073504 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..791e2ef6ccd24ff6d67bc6f9730be22ff8c46698 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.04292647255629735, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002183817456220973 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.21544171884055693, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005550023566245474 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.05701313684996747, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015578272366606155 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.012442143378991483, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014472589586154872 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.07244352525117184, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003313935921053701 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.0161244583453129, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008645866446531068 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + 
"rougeL_precision": 0.04051731369178037, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00211528013278168 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.2069649299284769, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.005347874387449208 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.053695049084465485, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014291447250690515 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.038419638740722094, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002117434826745668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.1872359151843902, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004719207977946223 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.04971651734495408, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013529088641741497 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 0.2878774611129946, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02191645372253148 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4cc2422bde942c6b412d69b98e93b41db9d41eee --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.4127938000153225, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005947870279386254 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.3825281448261863, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0052859610720223885 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.349736522766564, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004520392025115228 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.1913215925740367, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004557837262848308 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.17961092041143203, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004131524292644386 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.1603801895545452, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0036128830354525514 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.3375779972211795, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005167676099647845 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.3167769212968609, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004653884254645602 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.28533791057700214, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003862887782280124 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.35802129740566746, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005342789406951055 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.33193124379512834, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004717518923135464 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.3016787877742329, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003965250670159132 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 5.985300509654477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.27766678463599664 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..16b6e22a893dda07a21cbbab2a9b192eaf1165ef --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.573178421066209, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 
0.005581271283388568 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.506749252047594, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004824630423734974 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.48887346865850884, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004124343604769924 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.31664607130401096, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004795448811178974 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2793566561351489, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004295985376268616 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.26639799997553276, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003828474192856928 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.46571012037778253, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005055354594170153 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4153448849912012, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004462294046186775 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.39656298401250717, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037775849152242224 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.4980514939537973, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005216216980737215 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.4394122607391806, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004474633124329668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.4219536083199477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037956133967700628 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 10.836147959434891, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.4439636489991431 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file 
diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..75b716e19751a520f89bd7e67c273163708f956a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6216298638120148, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005214926213611381 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.5151245895293591, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004798742376668426 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.5199191464585726, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003921506273107964 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.3561340758337231, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004856631792498089 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2923563322177373, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004275271282396517 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.293158556124659, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0038860344856338604 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5065253572253645, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004906755746264397 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4226271623332692, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004467796136304541 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.4231221543799515, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037057307812666143 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5414810935939267, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004944762049361074 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.44852332402277933, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004450097989753056 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.45111782021645674, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00365315425959574 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"non-explicit-description", + "bleu": 13.831489814005128, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.17913601909321958 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a8996e1c606aa14daab945a10111c59e5435b68f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.63710786327223, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004986809833607892 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.513518509009756, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004813792451748782 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.5282464366263173, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0038579340284982727 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.3647903015535913, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004731428227236603 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.29336775947257837, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004290478506270675 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.298218326766099, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003802989738412845 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5152749151762644, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004682500347428617 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4178051528960531, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004473683244984259 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.4262980387789171, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003633393223889575 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5529446488559734, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004785169149105944 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.44435272882390914, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004455322990549037 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.45588243723533217, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0035980738333845344 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 13.968560539400379, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.2438004666871702 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..462b0f2ab78ee9c3e0d33429c5b07ce9ebaf3d85 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6467284283438648, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004898296462472164 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.5134564578559546, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004819956408111058 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.5315882216040946, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0037313655895080174 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.37350249398383, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004794722873620433 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.295869273851471, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00432835566336737 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.3023749154277527, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003782427307375681 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5269059518418968, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004687269593553424 + 
}, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4216944286006577, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004521154789292203 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.43309432687633015, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0036123106429110877 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5632821205544838, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004755701765839582 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.4464070974868443, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004475800727796388 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.4608334361195571, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0035311597997223576 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 14.284859993058577, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.3412668530317404 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1625913b7b136291efb19da0a60a78328c809616 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.09494981088402522, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019916895592894824 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.2228998019687395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005899199302218869 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.08842339112625007, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017984454249847302 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.009491868531725244, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005473549381635728 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"very-explicit-description", + "rouge2_recall": 0.05937321877500494, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003062549575730396 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.01550921123717699, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000863216419957016 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.08719336306054104, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018850492893613022 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.18578635977756294, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004400753558487686 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.07629382730868248, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012776155056859815 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.08924979026386516, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00198243157294504 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.19477434399246124, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.005308381914737892 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.07940975128762484, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016387793706256538 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 0.14362877072282437, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.01599760640382167 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..19c550a779e10eb5b05b9ddebde6292e69e6d3bb --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.5112682811495951, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005540977634393473 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 
0.4579965048546065, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005165963633977804 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.43390542470548804, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0042841473342437875 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.25940829162472157, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004518501612861277 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.2356293403240199, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004304856124437503 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.21795173194027365, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00365577553521092 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.41368010519825554, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004946748631043231 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.37301320326760096, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00456992163294301 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.34935816010638454, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037022423523572394 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.44024779450138224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005089317446235496 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.39323718964491866, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004624020652712365 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.37067691947406095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003757092658851965 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 7.860939478648252, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.397136024152674 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_2.json 
b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ad8c1df3b7cc7021aaa2b9ff0c06bc6905e1da74 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.6154756678079366, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005226356611902434 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.48775772286111885, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004918534040367806 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.4995611810480339, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004047233978311527 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.3398699521217031, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004795740723998624 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.2695930310727578, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00425732564977117 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.27278137993599993, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003848244890407377 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.4999239471058093, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004837761355126683 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.39933195066497373, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004494917857537742 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.40537627802968923, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003711457442747327 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5322953675512873, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004983631285334954 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.41919592625717844, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004479048342642623 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.4283890702376615, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003682929524417652 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 12.185026341639746, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "bleu_stderr": 0.26708696501401935 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c0c54152f76975b544dea7231c1e2d22e4083726 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.635650445531571, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005052022120074668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.48710114427680495, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004948848380530543 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.5099797708749086, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004002650362686148 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.3589984262150251, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0048015517732940835 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.27518191624261584, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004323754355044185 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.28524387854268946, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003925040473289598 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.5130845093457147, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004695868884598406 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.39623149975458216, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004526826786948583 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.41158287666032306, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037209996181527677 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5476870363212731, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 
0.0048318445677514536 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4179872547044159, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045119446034739595 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.4365260865248515, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0036764639046590435 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 12.62634123629319, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.2884546742216957 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..157f36a046e8b4793bfbad2c1ae5f5727851e0c2 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.6548086825341891, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004844894071962483 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.4885276740034546, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004992206467831831 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.5189004515187761, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003932134113995795 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.37233831020226843, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004732811866114619 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.2802554209845624, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004455143887198717 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.29267546193354504, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003899222651754383 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.5299274777087314, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004612940099912932 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "very-explicit-description", + "rougeL_recall": 0.39803918367090574, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004579736956902424 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.4196684086168609, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037106616792705995 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5650146079911865, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004705605198203453 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.41970914366117246, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004564652484857213 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.44492007481859275, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0036525630101091125 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 12.414902042012542, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.15981954068986562 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5d9aa6c31635acd4c1073923d0b15eb852501094 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.6577563774012802, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004766886700582482 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.49129937789940686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004940830750132916 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.5217786379718019, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0038125394534580656 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.38214760041810264, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004673723142913314 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 
0.28616105569647965, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004324909290729554 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.2989602811652376, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003725211805563152 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.5362327775987002, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004544774276661362 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4035564959053218, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004526430360279699 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.42517200603030053, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0035797978808816975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5690808761307263, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00466882829818349 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4240291676864204, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00451863928482515 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.44858456556468956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0035259657795515694 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 13.109705721827863, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.32959907803487387 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6868ded1afb5f104e829351cd8a2bf96e1b63c9e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.15523872132511843, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002548593592832838 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.2578540135540898, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rouge1_recall_stderr": 0.003573621223967281 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.17691111022689643, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024409979706619444 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03670831515447064, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009288154609525573 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06435957416412438, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0017209958971101799 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.04278206373018957, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001017383953281967 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.10782482061267594, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018535006806478674 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.18517259988551665, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0027120363581219203 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.12316699898062232, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016560558398715924 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.14436386999444625, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0023958601327950585 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.24020355278235528, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0033575116915700192 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.16436893494623508, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002269262360679833 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.6688327716635425, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08895603768971656 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_1.json 
b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f5a36dfc433abd1dc5f7eb19f51d87fd01d8e65b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.19251466904858092, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0025018512878404245 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.26482791100683756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002991401125215465 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.19817292293453556, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020508834598208367 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.04273398024706325, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011142605916336127 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06063785632750607, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015272956454286252 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.044030667142745485, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009919231933280177 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.1394015875344974, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001826742692456596 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.1940499587002955, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022281816682500535 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.14314934581168257, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014037837246694274 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.1781067800042176, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002312722439135384 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2461164131763784, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002792558935536892 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.1835790913887019, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001892477940762234 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.2866775641263093, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07082070203339437 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e9fccb4ac6bed64b0aa06d33fd2f6ae04804d9f9 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.22622981875069514, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0029442678786812575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.27051388062395576, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002891869201932646 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.21440807047777405, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020452435106786196 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.05650119739076315, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015075926525400492 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06519053099084247, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015252802022823956 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.05125393540374588, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010901442769842603 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.1655500657679839, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002257578647509859 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.19876519282475, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002181086524739298 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.15582785215592246, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014510195081137937 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.20977427939720425, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_precision_stderr": 0.0027791163801520113 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2505198544806778, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026988444846593263 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.19828406308602203, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018921021547870405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.6575458090890987, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.1041271763194214 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..03faabcac2df3bdd19cc746eb67630499990abbe --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.21044043846517407, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003415454177959976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.22091950701260638, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032225297099515117 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.18231062776327114, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002382883544618064 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.05593482165530236, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0017382726978577543 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.05513971414698624, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014788145878236471 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.045684759403535856, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011288709968297803 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.15945923460088876, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0027306827278561255 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.16575564694153058, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0024709668753929714 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.1358859920094012, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017633408647579247 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.19655719112323583, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003246109842646873 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.20527494742011107, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0029956447445675 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.16944346876676417, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002216821657918166 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.6322531721634923, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09664327595514159 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..276ed3cb77148286b34b4262edcfaf8edc3e5482 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.07543339925751304, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002912744472352012 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.069662997425629, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002574832610229587 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.05957194176610327, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020806329854399994 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.02006705843647472, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00126274734686985 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"article_summary_en", + "rouge2_recall": 0.01752499067978901, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0009388325078673943 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.014937212652877212, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007702355493743181 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.05854875141211828, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0023530742721753948 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.05294028592073027, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019759992033135617 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.04508232351505334, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015754425551504734 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.07005378436649634, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002739806542216889 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.06422067776590708, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023755736065222944 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.05492996905505073, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019232228948358645 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 0.14536104754565293, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.01854835494441183 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..aba2d80001cc2bacde130dff6986bf02e100996c --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.014365581022065697, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001560512259717543 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 
0.010914106815652429, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001117758970327415 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.009728718774236622, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009456594824355727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.004079235472292088, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006970061514227833 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.0027410666892011626, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003708439715072337 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.0025962149623228593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003597331522493365 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.011459758165628971, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013480152992941183 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.008202875925360011, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0008427013537304956 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.007361100442935105, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007285180962432676 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.013658620509020312, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001505727520339385 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.010192549845396262, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0010464774903728251 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.009121295941974078, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008902015179642384 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 1.4572877711432085e-12, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 3.3364596700068064e-11 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..237be0c0c8d4381da60387bf015bf16d5ae8002b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.06452403733371909, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014967240435340412 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.08494571623153917, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0019362539528511185 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.06517623892397213, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0013838699418831327 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.007840253016222527, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004579579103774128 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.011290392212911112, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006492975987186066 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.008217000605831354, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00043087034048410583 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.054902554802216445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012088332725248313 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.07398900765280814, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016591560862410744 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.055895886010082024, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00112519759543134 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.06060617538968794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001392308983087042 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.07979703481975739, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0018051602334693293 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.0611621603763303, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0012850318876890584 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.4889087616070698, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04970736558027736 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a890fb0126b5a615ce0f23516864cfea24728eac --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.1151865879029384, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016250307877578995 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.11071719364535992, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0015355647070335555 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.099574793692519, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0012103234622713214 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.005991291271737141, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00045282560241909244 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.005418015321491166, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00038092535887569517 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.004883245190477912, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003120388028651935 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.0927927612021365, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001300637168444661 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.0891743403262336, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0012125027041917508 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.07964868005031557, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0009142347526244936 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.11128782906033827, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015558741024301066 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "rephrase_en", + "rougeLsum_recall": 0.10709447290067853, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001469683871656218 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.09625017841568764, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011571719141370627 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.38700981358791664, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.028096574852327708 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bc116b15fcd7664ee0c0159760849e1463f98fa0 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.18923293051242865, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003931215744109829 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.14888789271257125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027634371759885064 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.13402973283377853, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022221428328858175 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.05250476162006954, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0023465752364279506 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.03373827776460467, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011812065114166165 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.03139270460463245, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010530936216454475 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.15396186822994368, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0033958787799507834 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.11797169313356873, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.002188824119755982 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.10624803463455436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017616455857783213 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.17982708294304905, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0037912824701119793 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.14035126579839385, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025962250169538686 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.12646589294514818, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020915649804801885 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.9793617937243397, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08162245357603662 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..31223e38ebbb50df2d91745e800583b536b15cfe --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.2111603613464384, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004316562776200497 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.1609555599016601, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00308128531187493 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.1439432413777907, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024332384864776345 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.06549752110515275, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002622796804725408 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.042567827326622294, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014369849379045625 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 
0.038684529531259455, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011859610306448835 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.1729592486561157, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0037461636917488455 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.12871277703565398, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0024988677821967836 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.11485216819737429, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019389059734766416 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.19990549390147885, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004161498874034307 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.1505907917034171, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0028923842560353457 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.13489114533789145, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00228554203025425 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.2238921561637253, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05819769717187897 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0419426f5915f39ec23f5b6b65480bf678152e60 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.07178178912956662, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0030761995624091995 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.05474694667796429, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002299511195674342 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.04800715721597718, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018517108845868325 + 
}, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.023714942805799485, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001754604095092442 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.014969677658351763, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0009231868199116232 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.013340223113737474, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007807829472162995 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.06022721464718216, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0026905863744203424 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.04482442172314591, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018971481069257855 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.039101876969439266, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015041565892311172 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.06748108589924676, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002932963122342718 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.050561649482659246, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002129232058478221 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.04451495139778381, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001720795547452951 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.09176000483883211, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.018538430182798854 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..89a169c4d0d0aee22cd9df54c1b9165d694a0029 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.011616993456994514, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013805524814779022 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.00835448439931725, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0009502784109930084 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.00754486001636464, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000823337338392106 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.004000350645166246, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007784970288200918 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.0022515189052559757, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00036203795760830825 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.002198804033049241, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003741043115897432 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.010070274369408232, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001239457603444862 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.007063094669971561, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0008091196938805151 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.0063189425177954785, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006911151227391025 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.010987649566674518, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013277168653009403 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.007790571198885496, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0008903647625496989 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.007053847928732188, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007783090333692333 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.591388902748712e-13, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.7248641435081037e-12 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..80c0e7c20b3d62a5034adf000f18058529ff478f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.05003590805872101, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003008991351388589 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.03897698027943584, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001807082535411678 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.029020887729209358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0012886185406965965 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.004163791361437201, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003541290963917177 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.007124397102554354, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006653760319599128 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.004692985339977893, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00037296907132779064 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.04565184907526892, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0029236719566217255 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.033207332283082446, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0015253259864701307 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.02437466011761543, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010326036760338344 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.04827889432422321, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002979117581191133 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.03635200073623344, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0016856668824093268 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.027066861347589673, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_fmeasure_stderr": 0.0011944361879851397 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.18780593483984345, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.018823304784897736 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ae79568a854198f3269927ffe4507645017a5a24 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.11799327511255576, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001719462692141953 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.11232330243321362, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0015217035094757553 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.10117935860269386, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0012163616286975524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0069239119815885765, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000623654699605598 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.005623588502170896, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003912742485234744 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.005230543547670169, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00035444811513678335 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.09540844175890396, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001400138877931801 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.09082951233546875, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0012149726294158937 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.08118840865208957, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00092302844984077 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.11397225810714907, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016549375922209658 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.10865018897124848, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0014652624998135145 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.09771333398780462, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011609788706252934 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.46974098072284515, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03801240502570748 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3ad33945174f72d541aadb6f5241defd4b47da96 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.20996717805826454, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0034007578731400064 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.1993937087135368, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002731695024599099 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.17027994304378627, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002076725188168782 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.051389185254084584, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001897799812976046 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.044041397040170155, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012928785613979953 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.03781539212782324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010547868001683674 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"summarize_above_en", + "rougeL_precision": 0.1650212857762001, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0027728900367332667 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.1569078602081921, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021538351893500597 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.13232700263489727, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015446220080689499 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.19770727209848482, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003271890975763662 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.1868234597182294, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025620174566918074 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.15952778726250663, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001949864660470336 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.464357632363685, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10634222940980814 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bd9a937c7a7e674e56b8ec3413b866adae608a53 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.20782306336974588, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003989603045660709 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.16825057480875721, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029189489594619615 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.1507903796696916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023330196000993448 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0595807183516492, 
+ "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0023524175410708278 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.041168151935746036, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013717156421301187 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.03747403723815723, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011258395271525113 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.1665910745941347, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003339398193955642 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.13313952170536733, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023073953391433122 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.11863694423078541, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017954646430025558 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.19573487618267474, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0038169172216808707 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.15732403928093558, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027338622985711727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.1410782911388888, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002189993147895174 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.1276225158037962, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09278611348359975 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0d1732ab52bc50a9a5c1c74a7daead72a638ac48 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.07231401152182886, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": 
"en", + "subset": null, + "rouge1_precision_stderr": 0.0031436302597989876 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.04905280746691137, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0021156075931983865 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.046502363487238116, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018245132184245493 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.021238504757177847, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015693705294511635 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.012651076337640541, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008729050713848778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.0121019455142586, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007543749415825447 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.05972427393508793, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0027048881388111313 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.03901746728275879, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016833219673054504 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.03704860848718911, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014486764721672722 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.06781797173242858, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002977618733021878 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.045328572912779454, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0019558428897344673 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.04318243952400433, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016973104965278527 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.035430233625646367, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.008458077459062699 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1e07897b693ba982f6b563ffd565d3a89d74266b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.009674555582528548, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012169889334882255 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.007085196538062005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0008674068166182141 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.006660488491670164, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0007542427498051077 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0031805595203384497, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007013139041818058 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.001691874973499856, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0002981439928898985 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.0017019468190412445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00029911355932309857 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.007534759253956328, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009640139065180261 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.005673494013808289, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0007048591996388539 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.005265213067519611, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006070437259395422 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.009159996330829265, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011664337061398202 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.006641483198385802, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0008151851711595139 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.006249297337617345, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.000708537597444003 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 3.413680523283268e-16, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 3.916096847388761e-14 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..69cbe498b0ed9feff0ae3df772493fa0807f50f1 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.14048666426016918, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027632901821198767 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.20669383392760043, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003301837819297362 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.14941919223655403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023626814400315678 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.029525585854477905, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008710537995147101 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.04703290588131188, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014209190224071058 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03335939346289185, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009250316805971055 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.10735133959204357, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002260925698512594 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.15988100122471754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025788969320249258 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.11304744711913277, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017089859184516304 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 
0.1306418700473533, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002629490470671884 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.19167920541216937, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0030766125540393364 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1383102708440391, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021868611093705326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.2473591602494687, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09068336197346967 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..92937dd9bfd02fa46ca67b4cd1909042e542b46a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.20164190917278885, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00340245779140814 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.18321322169620108, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002763046996934259 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1614637512336107, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002154838127118863 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.047278865796894595, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0019439681128851241 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.03836443905737192, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001290609856100645 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03413869729774695, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010840564141073451 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.15753098406464713, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002833744217070429 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "tldr_en", + "rougeL_recall": 0.14069990485042436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021198152025286845 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.12354041881834915, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001615855049003565 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1895150765616292, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0032439901277073056 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.17138871604556788, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025668579918568207 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1510952282258089, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020029762719872084 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.161551083223167, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08752015895630673 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c559f1f1940d611ccf212b5a6e1538f494294d36 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.30853356659765047, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003908986083294316 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.23611642752367193, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028098851869033524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.22448439980932258, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022164668755803446 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.09422907936670104, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0025861688595862934 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0641343554561749, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001505883040485307 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06229028061393615, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013505011306650329 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.24269847651739596, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0033468376687347985 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.18285960289057043, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022296534964715052 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.17363950691564434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017583318279909019 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2890613207429283, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0037637245904156287 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.22021341816156315, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026435827841586675 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.2094330802436139, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021028796887348792 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.6712694957538092, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.12335434340064529 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2893198f46dfb03ef5f6e83b036a50391b0d748d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.28082836293366026, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004319205532930181 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.1968690045661053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0030792534914180656 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.19386388642181623, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge1_fmeasure_stderr": 0.0025825739138006965 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.08943278692521785, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00260423039924173 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.057174167865699795, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001596148314592036 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05709915142524781, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014322906346376607 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.2250992736689434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0036669609547972885 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.15462182132992405, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002462032347060667 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15256193731747766, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002064934954063214 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2658819512511678, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004162301116768716 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.18550776729783153, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0029178715298313596 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18280631870790667, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002455551930143459 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.8478598408877533, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07777205711444814 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ba5d3826c3ef95b2a72e1e243e56d7050d6df021 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.09422577510372415, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0035020894909113 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.06252755874219122, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0023819303265762287 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06231197551494105, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002187598780650555 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.02901231903773154, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0017246417352838742 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.01773347814345909, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001029862057905482 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.01797607134766925, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009610791037997163 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07680762398075403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0029646336358863275 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.0495501454165194, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018970706209682415 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0496325144290003, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017538529763009563 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.08907486896592501, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0033434435220432352 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.05871435609309791, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002238019081316999 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05850840107044648, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020558492019394463 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.06105217041635524, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.010329575504157805 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3f13ca1635c2f06e6debdbc93df30cc85be1fa71 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.01542645344389709, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015844345606781326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.009780146620517932, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0010592344698244329 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.010082013794884772, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010101938209082168 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.00537468824664498, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008303851566942339 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0029150772505705507, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00043016549798545706 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0031781048041287503, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00045646146249046847 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.012798829405832771, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013675936853478024 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.007665110474443101, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0008164388748390192 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.00807413792108403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008184301651151722 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.014620295797040817, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015055626328950474 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.009149077393940025, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0009886149667351712 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.009445400067942783, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009397610137449974 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 6.646244004651184e-16, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 
6.685084737559834e-14 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..404ea88ff1dc6fd66064c4fed72153d7e01c6f64 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.1168570051133667, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019198426627741788 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.1481389494974355, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002193623326652998 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.11485230802775526, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016046705097408755 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.013144492989594468, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005689151589046461 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.018392171320423697, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008301672620875432 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.013633187300002049, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005587494363579251 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09620473485098723, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015771479521240449 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.12284031238534873, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0017612038827154002 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.0938871115571235, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011996389682465774 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.11042358047739399, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018173803801964318 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.14011085746312696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002067141292175654 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.10823902153271642, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014865854550978846 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.8119157520078589, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06804323722700163 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f9c041979495050615cc23ffe54cc23939060d84 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.11550809880471202, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015189235287772081 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.11401586371732765, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001531122722495617 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.10155038790836553, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0011942054554629799 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0054870610531079925, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00032521139602944963 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.0056622848349370675, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003699905567380415 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.004949458338597906, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002943527715399227 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09245029143745188, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011784020884857262 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + 
"rougeL_recall": 0.091481469494611, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0011972745307006892 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.08080820739114689, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.000878736781073791 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.11136411939666795, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001448526312362296 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.1102168441246706, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0014748044014163042 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.09799762905139109, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011402026800591278 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.38221868011066523, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03853483789988312 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5a32adfdb5f1d3dc718b782e8858b1179ba4cc3a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.1457819637516262, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002011906150918097 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.20956318422385695, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027712949082613473 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.15340882653832058, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018318475648937066 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.02271156846877315, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007950544243146088 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.035721592276879016, + "dataset_path": "GEM/wiki_lingua", 
+ "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012958816708578905 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.02440471180848951, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007647500431265382 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.1077615509646924, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001428107586317939 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.15720431590652129, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020890517297875114 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.11306799842590186, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012328157824464986 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.137408970351226, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001890837273370008 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.1972394210829142, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025925068905230946 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.14426644390129503, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016951248163021794 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.3895694754568146, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06986390519153292 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ca05255bae177ee4b9d6f76d931bc13898952332 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.118327473789933, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0023211153395344495 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.1694788781476762, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 
0.002956629357723171 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.12171175588166115, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020194351189424006 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.02010129949676097, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009300686699073729 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.029441824320720608, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012058268535209634 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.020068262069926297, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007221000793102024 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.08953936172947453, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017775315158264063 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.13029929545942714, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022970146784003155 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.09159725836407859, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014444402919229701 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.11070722310951053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002166951419812563 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.1588130948616694, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027649492276086316 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.11379947696823768, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018742255655409391 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.3731463518427973, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08696587018606129 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..032f9fba78cdfa75554731b7c901f9b4d2d0fb8e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.03293908776302082, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016730427345075334 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.04519023931424098, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0021035039013836245 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.03210009752200412, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014634312870140265 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.006510228955843905, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006254896794491436 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.009066760076675555, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0007714841572782599 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.006176815477796111, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005244170311147141 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.02522752422399141, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012818467423676987 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.03584434932056522, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016868429030500354 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.02466933947334243, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011019585144221695 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.03087353364690276, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001563351432924281 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.042420874912671644, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0019707727296120795 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.03008730837903098, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013711177420732026 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.06336544092366646, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.006198900880420144 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..693aa022ffd2399147695b2e86c597d566e4f319 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.0028814261031211595, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0005460823382341378 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.0042379170709383514, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0006322445256233903 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.002942517384632592, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0004530137775198365 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0007022578331392375, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003456033604074101 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.0006207799035616884, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00014464546198768093 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0004737386445928847, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00011428924831347669 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.0020692242308215287, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00037071046415170156 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.0032284384153396534, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0004792314433997919 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.0021726977472457452, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.000323151520869407 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.002690957058200923, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.000516432539920016 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.003973277793754405, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0005917271674956313 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.002751073118371148, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.000420923652085459 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.2107162453936286e-16, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.3979566517499855e-15 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c3d9e7a441ddec05b206c4e88a89c67413dff9dd --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732954 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014782913600996659 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa027c251fca42317c2563c95cf12e9c0a45f68 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01500870618212173 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014987482264363935 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..53d32876f04a8b362391748cc88aaddeb394fc79 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.369, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015266698139154619 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.355, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015139491543780532 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b3208569c86f81a947c776ad2ccef84e82b85bf7 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.37, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01527525231651936 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.374, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01530876736900636 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e41d42d7302ba9d7a795e767e4ed7d426cfe9e61 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.354, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 
0.015129868238451773 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.354, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015129868238451773 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5ed71531ee1d0b5322adcebfb1505fb0cc5ca060 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.371, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015283736211823188 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.351, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015100563798316403 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a1a2705a32bee4afd595fa98a61dae7cf3e59385 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732953 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014746404865473479 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..bf0c1e1244a82a56e82638f26444960c003d90b8 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014888272588203934 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811482 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2df8c3cd8373f883d9dab841099a5e89a71f1a0e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.353, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.0151201726054837 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.353, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015120172605483696 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8863310d699c369b7eb0a4706b04c29d299d0aba --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.366, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01524061272640575 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.357, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015158521721486767 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fce467df43c47cfc2bbf0827bfa53fbec193fdc7 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01497675877162034 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015050266127564438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4a11a7e3e8b7fd8eb0c082b072761223b32f610f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01497675877162034 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015050266127564443 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cc3abc16bc1cbd2fe30fd94909a9fa38859b8563 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014933117490932573 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229857 + } + ], + "config": 
{ + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..158594dc10770aefca34c3d4933014d6d3b184b5 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b76b0ad4be89225fa1872995f13be65f61daa036 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.349, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.0150806639915631 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014734079309311901 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7ada6855d81c942bb5d3e3f92abc3fbb402141ff --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.362, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 1, + "acc_stderr": 0.015204840912919498 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014998131348402714 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c1d13575b98f622008f71e432daa8d4a101339d7 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.347, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015060472031706617 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014944140233795028 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b9ca541f05f27f332b7dba08b219b76222bf3830 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014696631960792511 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014818724459095524 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_0.json new file mode 
100644 index 0000000000000000000000000000000000000000..85358138a255603ad37d4593f2a7b95ff3c55cb4 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014899597242811476 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811473 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f3699413b830483e958af6d29ecf56a4b29c01f2 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.349, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015080663991563102 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.348, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015070604603768408 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a9bb3b74e45b5669636ab489917c6fa7b6747aa3 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732963 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014944140233795027 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2af44c203ae86c345ae8f9a9f49de16c2e883b00 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014944140233795021 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014922019523732968 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..33036b4c5a5ddb61f523bab75b99a572e7319ead --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014888272588203941 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014976758771620347 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..59d96fc93f629d639bc7aa2b70f3cd625dfbbb7e --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014899597242811475 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014865395385928357 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..180b91ea8519fabc72424995e6de3ba8de15e144 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.348, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015070604603768408 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9a7943b7ac650633a96b084c4ce68c32ca2ed007 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, 
+ "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0bf51495deb620f950d21c7c8669e9ffda88c47a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.358, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015167928865407559 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01500870618212173 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d6094e1414c3e02d0c30b41d2da4463297a9e7 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.358, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015167928865407559 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.35, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015090650341444233 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..16ecc767e6ff7f643901e9628ff6e4517c6e93fd --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01492201952373296 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 
0.014944140233795028 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0fbd5a6b34169bf4453119e1bd8c06d542e154c5 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r1_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014734079309311901 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014806864733738856 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7c322ed80cc51dbf1f19d19c08fa05a3f068ffe1 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014944140233795021 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.373, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015300493622922809 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..420af407080b1528abc66c0bf29bca291b29feb8 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + 
"prompt_name": "GPT-3 style", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014922019523732954 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014865395385928367 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..39b9a7d9bc4be37498e578879f48faf25d5fd079 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01489959724281149 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014876872027456732 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..08aed5a69604abe2b3674923732d33c444ceeed9 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014734079309311901 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014818724459095526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_4.json 
b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a8402cc6ee73c9568ff519a66b22ec85a856b558 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014818724459095526 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0729eeda2f5568f8f1e6fb7044f6ffe91f2ccfd7 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014782913600996666 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01484221315341125 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f1e525433c56ecbb7850e83ae6efde123d0e013f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014910846164229871 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014671272822977883 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cae7a44c42004f35b76ef4c36c9e4e5a4bc41f1b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014671272822977885 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..de00699ddda729b000c0b5bfb3c5399e9fb136ec --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014758652303574886 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014671272822977886 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2fc7112ab2a32edafb3aa397d9dba93cf84f09e2 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.299, + "dataset_path": 
"anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014484778521220468 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.304, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014553205687950432 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f00d278922b3ff4430a7f02f6a587e1f4c86011d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.0145666463946644 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.307, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014593284892852625 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5998f4d7d45066712993969b6406349346ef1534 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014566646394664401 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014709193056057111 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_0.json 
new file mode 100644 index 0000000000000000000000000000000000000000..99f11fdce2bf77b2af2980149021b08925078ada --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.309, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014619600977206491 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3a7afadda3796ee52860cfb6fbc80251bca1d765 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..846f9fe003ac71779b4a9799019c806566e14ecc --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.303, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014539683710535233 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014683991951087987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 
16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c5c294f7d303d0a75b2ebc2cc97cf38c5b1feda4 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014746404865473477 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01477082181793465 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8bc6c5ce75dbea28aeacc5b72b83e6c7aafa800d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.301, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014512395033543157 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.31, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014632638658632891 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9c609a88c8b4e80504583f71396a7fc3f061aeaa --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014746404865473482 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.307, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01459328489285263 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3ced47d390e759ba54d0c4e9883bef1fbd0c7a3d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014899597242811483 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014888272588203933 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d36e5433b8d5cf0b83a980697476a9b46ca5b725 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.297, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.0144568322948011 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.302, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01452608023545954 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..54aa830be974ac608f3895f3ddbae6881d4a093a --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014933117490932579 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014910846164229875 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5fbca67593ae3e2b17500b3b9021a910b2aa83a3 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01491084616422987 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014865395385928364 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a0a5f4e904bea90bc3e52a0477fae62a4f22f148 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01485384248727033 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01485384248727033 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f94d56bd00f0d5d48e2744da22de108ca0a74561 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014842213153411245 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014865395385928364 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..041a85921760a58ac05ece1936a7778f21bb9c19 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014683991951087955 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a6c82364aa51bc30c9413a4fcea205ee14fae6cc --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7b1e7a7c6e7fde49a7f66d60f1bd236b7aad8be2 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.311, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014645596385722688 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.308, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014606483127342756 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..024f46b846f68947702ab0a63004f91fd3233dfe --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01470919305605713 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014818724459095524 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7033004c79fec2857b80df85ffd3afaf479a91d6 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.299, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01448477852122046 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014683991951087976 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..08c297ffdfbdbfb9bce201286faf5cb934c1e2f5 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r2_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014709193056057127 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.309, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014619600977206488 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..217d6f2ad613572b99117003240c09dc2cfaf073 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013680495725767785 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.35083333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013782212417178192 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fd6279be6cb8ce25e9f7460293202f7dbad01664 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406396 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.0136476029424064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3706987a41e953721c26fef98d04229511cb5742 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406387 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01360541734571053 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..62807a024e5bc78dc241ed1f60a59604b1e7462b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3475, + "dataset_path": "anli", + "dataset_name": 
null, + "subset": 3, + "acc_stderr": 0.013751753243291854 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3525, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013797164918918355 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..82f022d03237f01eb85615bfbfd1f0d5a259026f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3433333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01371263383046586 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.35083333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013782212417178199 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..30f112133b1f2acd13a9ec46818d089145141c98 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3458333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013736245342311014 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3425, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013704669762934732 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..a05ccf5044cbcf01cd9e8fe1020a7ce7ff7fa302 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013655897185463665 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.31583333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013424568830356446 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..80bcedcc8f78cf2252badc77b6836e91a4cb8853 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013639261190932889 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463653 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b18ed2d9e49a26e46f74fb7ed2b3f2edb19b5ceb --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3258333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013535422043417459 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013570806258433628 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..53d218d0835015e1f876cc5ba2c4df1d37f756ad --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.31833333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013452948996996296 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013508372867300217 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0924a20a95e0fa90d1b63ef8e5d47222261b5ea0 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.31583333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01342456883035645 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013462309712005134 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0da85f8be71697450d54463951f830902b0a7038 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + 
"prompt_name": "MNLI crowdsource", + "acc": 0.3125, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013386029277441229 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.31333333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013395739415639082 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d134e97eaafad54587495af6f239da1f3215844a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01357953127780092 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01363087184382147 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a1dfdda7f2d28e815b6536b6b5b36e202e782576 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..176883d90d21fa363ae0b33b5d54262d16f9e579 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.32083333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013480882752851552 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.31583333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013424568830356448 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..43d6ee0cd9ccadbfbc180691890a9621f0299137 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013696658778002515 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ebc4363213300086822568aaa075b8b14b592852 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013570806258433623 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3475, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013751753243291852 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0b325500c6fcb4e740c2228d60068571d37d93c9 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013570806258433621 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3258333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013535422043417455 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9d68834e0b9630ac5b23db894ec92c0c64fc9e6e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3258333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013535422043417466 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013462309712005124 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..44ffb7184ec66070e19c19d2d079960650785c2e --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013639261190932886 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013696658778002517 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..42e50b38a303c3f0884efecc5e68d241a36ddc8e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013562032919529019 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881634 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..38b0a770f76f8cfdde4eda1b64ba989a834d0908 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33916666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013672343491681819 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01366414400661827 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f652841adcdfc68e8af9b51b542ae36ded970d53 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01360541734571053 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..746e4971ab0b43758d8591d753ffd7f518d1f575 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33166666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013596836729485163 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33166666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01359683672948516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f6afa678afe81e684cc02942075cd5f5bf36a623 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.31083333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013366457845965442 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01363087184382147 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..61db2293eb14940425ae73478abfd1543eb52b25 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..45f7afe51fc15d33b7b83d2d3e458c9e07a2969d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013562032919529019 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881634 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..35e2b46dbf581a67318fe15776c39483bba5bfa0 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013696658778002514 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01351743812088163 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6bb725edf93ae2a89e612a1dfc07647ef3cbc992 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013499258621103245 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..870d7d500bbb79e8b7fc51b337929bc67be318e0 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_anli_r3_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.31833333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013452948996996296 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 
0.3258333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013535422043417457 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..051e0e012c565795d57f0f81d7cbb3ec65990986 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24061433447098976, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01249146853239058 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24061433447098976, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01249146853239058 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e1b644593b0fe437b22a126de00f50517677688e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24744027303754265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01261035266329267 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24744027303754265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01261035266329267 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_2.json 
b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..266d022c2bd7e6f549310737b3fc7c59932b4665 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012414960524301836 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012414960524301836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b145e6b14be733a81a27f7b85b29df84ba6410b0 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25170648464163825, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012682496334042967 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25170648464163825, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012682496334042967 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..78bf5bad6908243f29b83a06c4c0c71c59ca9361 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23720136518771331, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012430399829260861 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23720136518771331, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012430399829260861 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..193de92b3f104f48797301882009376ffcab197e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.22610921501706485, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012224202097063257 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.22610921501706485, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012224202097063257 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eaf8c3a64f6c5ad6ded67a2652c2f62c75def726 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.27303754266211605, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.013019332762635736 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.302901023890785, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013428241573185347 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..38c9f896fd2ab8ecf361181011975c32f34beb02 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26791808873720135, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01294203019513641 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2960750853242321, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013340916085246266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9f6a3b2c588e8e5ca9c780c99556d8446f9ad112 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2568259385665529, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012766923794116801 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29266211604095566, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013295916103619411 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b4d0aa4b7a4bddd00f0f0fc894391842f4c2f99a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.24232081911262798, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01252159329580012 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2815699658703072, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013143376735009009 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dec45b50f10a2cda2d197d70496b0be08790e6d4 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2525597269624573, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012696728980207708 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.28242320819112626, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013155456884097218 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3df2c1ad00a059fdd794a99847e2cf831779093d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2508532423208191, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012668198621315428 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2841296928327645, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013179442447653887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..460901a390df697c9d4eb1b1ccaa4b7111001128 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24744027303754265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01261035266329267 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26535836177474403, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012902554762313967 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..386565c3425b0950b8ed789e1f0884495bf29956 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.23720136518771331, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012430399829260842 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.24232081911262798, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012521593295800116 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d3f042d50be8ee21003934a60613f2a6109f2f6d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2226962457337884, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012158314774829926 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.24573378839590443, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012581033453730113 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4e817a4eec16a653de89f0832921dd80a186a36a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.21331058020477817, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.011970971742326334 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2235494880546075, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012174896631202605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..32fa183de8eaaecc3862bd981b4bf55460d19fc4 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.22098976109215018, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012124929206818258 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.0122728535825408 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..df1b74cc3f8b7a8fcf306e69fa0eea6a5350adf3 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.22610921501706485, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012224202097063276 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.23464163822525597, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012383873560768673 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fcea98ac73e8aa77c04160e9bac2b4254b079004 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24232081911262798, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012521593295800115 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24232081911262798, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012521593295800115 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b45382982d96881c82d9c454131aedce2ba2f88c --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23037542662116042, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01230492841874761 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23037542662116042, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01230492841874761 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..deeb661076f85feaa92fe7b79a78d98c096a6199 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24573378839590443, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012581033453730114 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24573378839590443, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012581033453730114 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..29143db361191c7ce31ab59a25f6dadca534b2d8 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24914675767918087, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012639407111926427 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24914675767918087, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012639407111926427 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_4.json new file mode 
100644 index 0000000000000000000000000000000000000000..ee78ce1674b1defdf86505af625842c9e5875cfc --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2440273037542662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01255144762785626 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2440273037542662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01255144762785626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..255502f9e015ad749dce9b6c200c15c46c8cf82b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24488054607508533, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012566273985131353 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24488054607508533, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012566273985131353 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..60ebdb9be42861df59bfa1d1257aa64920195df7 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.25853242320819114, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012794553754288672 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30119453924914674, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013406741767847619 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c186414f755843322dba89576928aaa5bae7957f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2636518771331058, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012875929151297063 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29266211604095566, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013295916103619413 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1be826b423256229283ef5b1d4ca16bfbeb0dc65 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012653835621466646 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2901023890784983, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013261573677520778 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ae6dd7a826e2a4f44cd40549aec3967ac4d3ed3f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": 
[ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.24573378839590443, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012581033453730106 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2883959044368601, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013238394422428157 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0911474d51a653694946d7d39f6e4fab39fd3de2 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.26109215017064846, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012835523909473855 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.28498293515358364, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01319134817983879 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b278fb5b1fc6411079b5f03999a30f501e59d8ea --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_challenge_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2568259385665529, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012766923794116801 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.28924914675767915, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013250012579393443 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1857c0e5b74f580d6354ca82e04b890bfbf7ec2b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008885233166386385 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008885233166386385 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c092652ee0e75b082afe2993dbda7f9b237e3759 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23695286195286194, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00872518926147229 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23695286195286194, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00872518926147229 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..572d32c0cd70bbc27485cc40d62214aea04e7de5 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24705387205387205, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008850055161459239 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24705387205387205, + "dataset_path": "ai2_arc", + "dataset_name": 
"ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008850055161459239 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5304631704ffbf305ba423eb35b6c4484dfa031c --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24705387205387205, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00885005516145924 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24705387205387205, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00885005516145924 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..48f6ac33cc4c5286d6bc69fc4a6d0217e7e255ad --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23779461279461278, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008735850753507992 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23779461279461278, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008735850753507992 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..2bdb89edc6e4ac7090b510069c2109a14635120e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2516835016835017, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008905088235948768 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2516835016835017, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008905088235948768 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3a89abb38284c7351af2240b125c22f21b1e1e23 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3531144781144781, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009807078935467608 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.30723905723905726, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009466688832475374 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..27f88253b43e59d77d965398a2c5c05080600a90 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.31397306397306396, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009523245335215511 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2967171717171717, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009373559492986842 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5745c66159b11bf26581fc9aaa1fb6b2add2d971 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.30976430976430974, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009488172851903717 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2904040404040404, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009314833302936282 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0828a5c6510009a5429b4adc8095a94683f0b186 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.30934343434343436, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009484615220606826 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.28914141414141414, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009302827114597425 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..60091f8efed0f9ba75e30d731c3dd925a3d693f5 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.30176767676767674, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009418994158522525 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.28535353535353536, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009266280584997755 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d6587a9fdfe8186c00b0c094b0e39a9dcd433627 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2984006734006734, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009388855914040432 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.28745791245791247, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009286682281593418 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7befb24b97a79739e30b0d227dae44dd4ba3a9e1 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.281986531986532, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009233124071053648 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26725589225589225, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009080463246017469 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7c00bf118b41ab3a12072f8de6adef34f9e065b5 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2756734006734007, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009169229476542563 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.27441077441077444, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009156177122244525 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7102edb3b5c9c3b81f6170813cc3d207c172145b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.28914141414141414, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009302827114597428 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.28619528619528617, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009274470774627718 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f4939e59a6a6badef2141ba0ff7a0c70122d5f34 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2824074074074074, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009237303403479344 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2781986531986532, + 
"dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009195059601583901 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2a34d07726078738480e80c8a43c4c74ecc3e083 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2748316498316498, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009160538115254961 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2769360269360269, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009182190173795889 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..14323ca4260771d101a9512d6267b9eda1692684 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2697811447811448, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009107527914671064 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2697811447811448, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009107527914671064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..a798970a1096f7703fc8a92444fd8f2083ba8755 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2521043771043771, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008910024163218197 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2521043771043771, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008910024163218197 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4b740be1608fcac90e2724ffe16a47420d7ca6b7 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23526936026936027, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00870372426971864 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23526936026936027, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00870372426971864 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a5722c35b68b94df8c2f32cf2f5614779cbd2634 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24831649831649832, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008865199020660961 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24831649831649832, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008865199020660961 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bd74fc0c06fba94fec0ff42637642d480aa1bf46 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2542087542087542, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00893453768114154 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2542087542087542, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00893453768114154 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b054997252c0954f0f89d4ea20e07af0d160a6e7 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24831649831649832, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008865199020660961 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24831649831649832, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008865199020660961 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..7c5140e812853962b45262abf51033d42ad44539 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008844984581934907 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008844984581934907 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..60094c1b3c84a69f14202cbff2ba8328cf16eae3 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3383838383838384, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009709034670525097 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30723905723905726, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009466688832475374 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c93052046cfab340ffe41dcb492322d5aa4cb6b0 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.31565656565656564, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009537019245566084 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29292929292929293, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009338583737393599 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cc95f937d6a4429e73d569385c14fc4e0b7c9540 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.31186868686868685, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009505823345817652 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2878787878787879, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009290733161670164 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..318b96db65f4692bea06ce91fbae3b14782421b5 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.29545454545454547, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009361987126556453 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2786195286195286, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009199329195026352 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..572b1f214591c1a970c51adf5766bd980dccc096 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", 
+ "acc": 0.2904040404040404, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009314833302936285 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2824074074074074, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009237303403479329 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..96438e304da59d20bf0f66062c98f39a216dc503 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_arc_easy_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2908249158249158, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009318815921176655 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2849326599326599, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009262170695590658 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4e289c3470a82520762aa4a58c0a4c383b1074f8 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5386666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009102888762598247 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6063333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008921375326707084 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1fbbfef5fad1d6530ccd5ce4eb3c7cc9858c58d9 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.538, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009103824830376472 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.588, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008987709736566395 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bd89aa89632753e89e011112b2e5ec025b26777a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.541, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099483512819306 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5856666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008995223478188031 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bb4a637c4be161b015bab8e8beee5f65d20195da --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.541, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099483512819305 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5823333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00900559683375783 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0bf42519e1b659e6e962381126d5a20527c978ef --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009100967487199723 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.568, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009045400659508363 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d65577359caffb523f317e2ab7e8f2d292759d44 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_GPT-3-Style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5223333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009121118663627248 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5736666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009030591966818142 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8ab5af2cc4324a26042651fcaca7c37cbba1d70a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6236666666666667, + "dataset_path": 
"super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008846558976258922 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.59, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008981103499757514 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d465ddfb7777b6aff1c50542264610c8c97a0456 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5713333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009036836097555083 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5653333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009051951785603835 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4e39b52eb16a1e47cd4718acb8aaedf500ebf40e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.578, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009018450207660424 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5713333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009036836097555083 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_3.json 
b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6dc3925ccf25aa96bb166753aaed458e82b24480 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6033333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008933122315228996 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5963333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00895916952266258 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..524878b07017cceb9fbcd8b421233bd9e3b1ce18 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6066666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008920048383377188 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.6, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008945762994765773 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d4ddac02310ad7c06375ec39f35864adfcfcc36f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_after_reading_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6103333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008905164372580984 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.6023333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00893695992571691 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6ad385150d3199c53f57e84dae8d708ab5b60c6b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008846558976258922 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.6233333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008848110494114768 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e8670a53913851f5c54851ff9abe0d6d9d443796 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.609, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008910637827273029 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.601, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008942016171856502 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..43c9dfbfdb86a4d70bea4e5e49b1162f9f4ee311 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6073333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + 
"subset": null, + "acc_stderr": 0.00891738144014832 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.6053333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00892533006683219 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e736471e1e4ee0344407ade1ea23fd00ec891f38 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6056666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008924016166504414 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.607, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008918717088507559 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ab09073f6a449aa4fc34a1febde2aa4cb4f17287 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6133333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008892593055774285 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.606, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008922697920438162 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..bf9eaa146980d5e8b504efaca145f8d79973d066 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_exercise_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.615, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00888545536850563 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.6103333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008905164372580987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9eed6032130b077b5c175996f481af3d03957ceb --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5273333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009116578321738462 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.39966666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008944518370322185 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7c51c9d649679b725db8c2fdb58973ebc3d68f8f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5763333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.0090232041691723 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.573, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00903239695383109 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
+ "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0dc63cd804cac43a2562812ef6714191376f9256 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5823333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009005596833757831 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.57, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00904031207504128 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ed23e16e088cbc7dc6fa7619f2630c24d085abb4 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5993333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008948239303079452 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5976666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008954354670397112 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d181b5942d3cedeefed72cd70316542002f7f1af --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6056666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008924016166504413 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.603, + "dataset_path": "super_glue", + "dataset_name": "boolq", + 
"subset": null, + "acc_norm_stderr": 0.008934405848700122 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..601a17b9f269444a75cffc84e19ee18efc04c34a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_valid_binary_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.6106666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00890378508047089 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.6066666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008920048383377182 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..76c520438f41e0b60712dd02fdc6decb5f9e3484 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008846558976258922 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008846558976258922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..20525601e37d5e08d3fa3e0ffa8b9cd655b3e220 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099982269204863 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099982269204863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ceb65f0b4683b7d0c119866fc8f6dd776b3552e2 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5943333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008966262991425923 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.595, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00896391565823638 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8e9a5f80bbe8dab401c8cb2b5e872b38c2e16a05 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.6113333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008901013367923425 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6116666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00889962094339769 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 
3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..19480f3508743299da0d699860e34223d7f80990 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.6176666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00887380602276318 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.62, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008863380835773165 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3f559cf394fc9d35a6184407f92bd3da2672554b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_boolq_yes_no_question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.611, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008902401412932073 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.617, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00887674483503322 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..41cd9a64c0692754778e14e521418fd6f1d1f639 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813057 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.23599320882852293, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dc9f84402f5899bbc20c00c226b1eaad38723a27 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0646095738380922 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.2563323201621074, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..59f7e57bee835d52831dbe6adfe8854c77ff284c --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.28166858017604285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1bf555d54cd6addb1c6ed263c8e4f525f87b61e0 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": 
"cb", + "prompt_name": "GPT-3 style", + "f1": 0.20038220038220037, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9e4b167d39e74f2a453f419b56aece97bcb02048 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.18421052631578946, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..218240691d638b3064baa43261165746e5d1460a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_GPT-3-style_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06460957383809221 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.17777777777777778, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b2698499dcb3ecbf547b5954354e1cf7ae34bb93 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + 
"prompt_name": "MNLI crowdsource", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.1940928270042194, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..44851e3ef82ce814bf15bfcc93378bf7eee38bef --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8bd712569748a1248469c0d5051c12c4c0a49557 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.3481187642745522, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_3.json new file mode 
100644 index 0000000000000000000000000000000000000000..4fc5c88e6e7a8d66684272071027cc4e6ff2c9f6 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2515873015873016, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..350db84514a3772927135bfd8a09ba83fe6d53ca --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2522366522366522, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..14c6509ed8ce6eb47f5950232a9f84cc9adf3208 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_MNLI-crowdsource_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0672477765493766 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.23566182215971246, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": 
false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..46d7ee5e39bad8f31144868218d06cc8c335afb6 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.25, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.058387420812114225 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.20014245014245013, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7d7bd5231b7911570b9631b609d88b4e0a57550e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..179b5d8be9b09517fbe769f1e4d0ac5a522ac634 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2642526964560863, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cb0bdd8ee38caee3e330dbdf1e033fb014989be1 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0663363415035954 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2858641489640703, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..990fa7e0999e5f5a245f32c9d67b1c21b1b36f55 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.3392857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06384226561930825 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2222222222222222, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f01ee73bd32132f3a0c25b6d3cbe52190503a418 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_can-we-infer_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06585388898066351 + }, + { + "task_name": "cb", + 
"prompt_name": "can we infer", + "f1": 0.26157407407407407, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5d26b35fa3242e4f101380d12118b83c54b7e697 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.30357142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06199938655510753 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.25353535353535356, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c74db5bb421eee4370de629035b85fcef4c23504 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.33654945683247567, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..07d7f14ed1b6774c82d579f3a9479af59461f823 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.30357142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06199938655510754 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.2959792578695018, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a0dd164460a1a83c2ad45e64227175e8e03e174f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.17857142857142858, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.05164277182008721 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.1712979526933015, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..06a62f2f310771b82a5f47406e65be95e9c7ca7c --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.19642857142857142, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.05357142857142859 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.18839196978731862, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7e0c172860e3c582b1d0d5cf8dd6b1e3ac1637ea --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.23214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.05692939024000109 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.225, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2ac11603c4df957172bfa5c1cd8d93ae2d91b02f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.21428571428571427, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.055328333517248834 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.19458615016659064, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8511ba3d75bcf79ff4b371d8d8467f47edbe8dd8 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 
0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4244f036f4fec179f276c469a324b4a18e110e38 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2833685198217218, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d5721e7b61bfe38af99e1d316a61b5f04f6ec60b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2960755091902633, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_4.json new 
file mode 100644 index 0000000000000000000000000000000000000000..0de023822e2c85cc2c7c9867a5ef899adfac696d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2565284178187404, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b27f27413dfe4a7fd05fd98afc9cd17ef4bfd862 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_cb_justified-in-saying_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06585388898066351 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2676134781397939, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..83c12a0b357c5470649b4342f8827e8db6c41b4c --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050161355804659205 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..871477074eb29a105231850df6803c95d007bad1 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f0dbecddcaacdd98c99b684effc74e61d20b7604 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2c742b4c0f4ab6c376b7fafac28cc18a56a4526a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.6, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.59, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237102 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5e7fb24ce74b6cd1ba09d99783d624339942bf1a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4019b57902f4fadb9505ac259988067e40914a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_best_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.6, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.59, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237101 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..055fe3020c66f4a8401b4f99486efedc684b859a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.57, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + 
"acc_stderr": 0.04975698519562428 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c5c7b82cc202b3846bfbd4398c735a71bddfe99f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a0e3948e9c18536c1b2abafe4fba543925ea53e9 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050251890762960605 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..21d2d99e4960cb7a4d53700a7e87d99a7747da5e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04975698519562428 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8bbedae845db44d044cf673e1ce7f8315620553e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049431107042371025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..82d2d127abbb4263607dbfb8dcbdfa31da6b2292 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_cause_effect_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049431107042371025 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..934e7f075880ab44373582761af4950f51651e49 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c4f3d76ee720ce4314382af01078955624b9691d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04975698519562428 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..63ddfb2d1ad4bc1d55a9d38762b053bdc28d7ab8 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.38, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.048783173121456316 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9399ca7d3918f1442a84a535275189bccc21e48f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04975698519562428 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b1cda3d787bf5f75cf666c42272953c585a5c996 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3a2dab584876e7ad923dd888a314a79da2a4a270 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_choose_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": 
"choose", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049431107042371025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..57321421eca5b1e587c533b53bdb3b98e25eaae9 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..334f5082e4f0903fe2a0af3ce0be1ac9339b67a8 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..505ab2273ddd56bbe8eb1ec5abed567195158b46 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..857f11e03dd2bac047dbd70cb4a19a49580aeced --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0ec24a147da47485cb35b62116b5eaaaf6da09e2 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9dc110faad7cd4a87487de8dfbd7d50e5530e07d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04975698519562428 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..655493762b64e744996ae3a630078cab0d10a3bb --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.55, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050161355804659205 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d40dd7fdc6bb4bd03fcd8c89d94e6262f6761c2d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04943110704237102 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5a6e7320b80df27538e49b2ab331afc8f8e2bc58 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..76aace0717121bb20465b5cb41510e035d9e0eb5 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049431107042371025 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049431107042371025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ec5977fa5cbfbbe57fbbdb1a5a02af108b0217bf --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_4.json @@ -0,0 +1,34 @@ 
+{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb3e9f62946922a82b4cb895ac8f7bf6bd2edb --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_copa_plausible_alternatives_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..843a9a852a42fadedf316cca5171224e7ee32b58 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 1.6981740230274491, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05639581747200235 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.18461775373723568, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0026005533202261635 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.2709421058746983, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002901032838963527 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.21306073373162734, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00260469936196386 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.05762170363834695, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00144636757600207 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.07665802406321871, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019164632387191742 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.0639746559185916, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015730884231270822 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.15100987885580808, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001847913237566943 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.2275598490119157, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002218584984658347 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.17623125609468054, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0018965000242015396 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.15227707203191942, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0020577913588062102 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.2249114067227135, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0023015700813454336 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.17615894675748597, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020576900261863084 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a9886eb0e6f349cf7eb7ad129c8196a2545504cc --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 9.96698641896448, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1990330923860415 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"coherent_text", + "rouge1_precision": 0.4609965771337017, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004586076601095137 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.37987535167878217, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003962232318603492 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.3948345170666501, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0036620162579725215 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.21903816752228814, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002976279745126517 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.17782940012286022, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0024379419479102857 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.184899842771481, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0023385739986275713 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.33718394432768894, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0037312597707337756 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.27549038942825305, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003082991317914591 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.2868315090359682, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0028875362371707236 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.37936486415476367, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0040948721855691185 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.3106003498059364, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003426475280011305 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3235575895067646, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003224417319481092 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_2.json 
b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e3ddc22fd056c8bddee5985474bed0a1415a52da --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 13.854353887594359, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.17297348749849764 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5636886053242238, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003365909868631314 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4600628012976699, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029921086682403184 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4817650325322006, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023949981341780365 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.28301589520613757, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027664917362106947 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2276135952020421, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022770165359506478 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.23835835323959847, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021017748917656984 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4181431608727368, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030295159480321168 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.33948065113070464, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025328414029523382 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3558600551401343, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021673932342781833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.47016389392378133, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032388215431738685 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.3828571201068127, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027788968245477737 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.4012330801491776, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023666952140306693 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e5362370e99866d4bb9b937302af5fd971ae57ab --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 14.583942693396294, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07837319643131219 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5820358474077353, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032331888050979754 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4693874389716453, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028872817993720653 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4949134231803726, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002236475337599633 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.29841477725265647, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002756754774322472 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.23792651366449613, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023167538990293366 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.2504276350046371, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002099307370130092 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4315178078770954, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029957554213462813 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.34591473020366464, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002476997441911286 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.36516427629163034, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021003983397447686 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.4875277711044115, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeLsum_precision_stderr": 0.003185953377452879 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.39272527260887446, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002758966209609139 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.4141653781843093, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023176252522451174 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..05d216b2fd7fa37aca45d5f82ee8d90c2c98550e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 14.709300883445545, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.13909289849143106 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5842200644947523, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032415680641664523 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.46707140188114915, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028634798993105617 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4953743872583611, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002262323452189699 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.3010808279506765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027909138698240248 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.23765379325641803, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002313565104143807 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.2520751492512158, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021451907698192933 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4338235689405383, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030183502388625087 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.3450632042572478, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002512770187116409 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3663278082137279, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002169786461353247 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.49060403536248004, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003216596887763901 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.39198654946334316, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002776695156562603 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.4158359809120709, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002376981852382139 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8b488c17bfb1529762c082967668c70977b77a23 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 14.508027202695663, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.19729885956116464 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5827447003386718, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003252743704799596 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.46479474176199786, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002779950681422667 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4939375271803598, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022129580308727064 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.2992451000703914, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027706344768827404 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2350933451860936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002225186354361509 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "coherent_text", + "rouge2_fmeasure": 0.2500737308803256, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020834454268491196 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4330121131753695, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002975630807020434 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.34399062086892157, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002433479099663889 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.36577122165110715, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002101489877024906 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.4903224173893306, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003175989510755157 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.39107749287557136, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026928788465534173 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.41541912166430456, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022884279022857587 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b27ed4945a14ecd02a274b64be7ec403a0b16ef3 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 3.05988651777403, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0810296453144049 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.20519732124109513, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00270006442645752 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.3468161863236788, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0035404054532813803 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.24909819431786423, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rouge1_fmeasure_stderr": 0.002893624823392621 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.07031878918831437, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014268262802144133 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.11439603360394333, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022184263434388354 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.08417727846653551, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016451293157964974 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.1598309675588018, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0017331961057331041 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.2787944514418254, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024559130158891516 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.19628984368933966, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0018722658911640626 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.17891503505698214, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002513816210748885 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.29862293188810446, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0032327191084023034 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.2161607467623904, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0026884533677181723 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e56f7f6ad24c93de515337fdf7250dd0b6caf07f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 10.006682639472844, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.17702306891001293 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.4789153736629139, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00448976955675989 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.3830905201721848, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0038258847063219895 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.40374325026432717, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0035397356280531637 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.2271866148988451, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0029128947832580934 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.17899394069939906, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002365623843266929 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.18891534472001092, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0022875763676466687 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.34845116104667717, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0036371332440899083 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.27628825839114474, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0029710867029376733 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.291888121149291, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0027977041936955364 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.391979289276976, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003999654789641658 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.3119866289090268, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003317127496333722 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3294523831273175, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003132967917625185 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b55c45cd3a8c8076f8107e27070be216a3fe9109 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 13.533351780962887, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.21996557214666623 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.5637517388847328, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0033693539461132804 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.45574622052482294, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029871424357699977 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.47993566559272743, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002412869717997041 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.28114460801866314, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027675305939149214 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.22386112791717722, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002265557627345361 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.23583645689850805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021064135619051727 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.4150907072454883, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003041198860592472 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.33333794137558537, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002521898919137688 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.35155736822685685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021818967091197677 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.46692814202061905, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003234906975148213 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.3767319992756395, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027905499560689166 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3968966833492359, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002390947561471317 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d38dfc011ce14017f5dc2b6be666fe20e1984c9a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 14.392809420394265, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12682420333047015 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.5761697396711711, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032085414101804934 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.46910207675338633, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002885851272557091 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.4936453397657041, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002268512440380754 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.2908760217107419, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027504762505249206 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.23472065889957064, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002338453487460064 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.2465503719143272, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021497257737069504 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.4238930690015189, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029754914331797584 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.34332965676676525, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024918547413801777 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.3615814515575109, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 
0.002132135118187528 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.4791873227267321, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003178341414436254 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.3893636074797145, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027438228796168866 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.4100151570685983, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002345807247704266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..afe72cae5791f8dcf9a7c4b14566441a12606411 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 14.793650168308517, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10622251877201895 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.5749775266346093, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031835883766827408 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.47259329496156155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028220142954102326 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.49601283925081735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022190615188824253 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.29266983484455455, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002724658955172727 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.23756368274974432, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00228326458262002 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.24939935966748733, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021121154249644186 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + 
"rougeL_precision": 0.42468886201366246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002911495939193518 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.34812073234739377, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00247643056659921 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.3653771997217671, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002109257136196024 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.4796086565780972, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031243508690243214 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.3940103192114069, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002715853863926917 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.41357826323193164, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023087953273462436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ce1ab6fc088189e277a25f18bd61ec15b7304688 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 14.845898823514458, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.21800724630678245 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.5769246962013296, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031674714789666314 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.47417689233706106, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002773164930669781 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.49837978991033016, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002207597586987178 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.2946768385756233, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rouge2_precision_stderr": 0.002744759884705772 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.238907636637235, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002260409516558731 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.25132933058908946, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021206561166708855 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.4261285955236887, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028653910956793834 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3494535705791258, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024239805462257812 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.36737081391649246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002093391008970402 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.48266010526884195, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031168143988746008 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.396635855622698, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026943167663674044 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.41688142252277666, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002316312633300744 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4a679c35d7f5ac9e4e9664938d63eded408cdf90 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 0.43088392670710296, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.03961518900803983 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.12074617507907323, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge1_precision_stderr": 0.0029749213887964613 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.05686771392013282, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0011647167758256662 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.07032991515919271, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00136157455736835 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.030789282891283725, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001109990652325929 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.02001834529014065, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0006990709895113358 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.023394283471936717, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0008044627503034823 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.11414146607161421, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028337136738440204 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.05338660372411939, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0010814515894240666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.06604473509926488, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001258776154468613 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.11860380990067865, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002956819438605847 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.055427618982087014, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0011338872506724367 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.0686478664349337, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013288862391063107 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..377b1ecac0bacb910e1aa4dedad1276b925368fb --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 4.2489438945007505, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10848523635627934 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.2978880326400029, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004314734059526238 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.20574103567352484, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003452156469381813 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.21799479871592295, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003169109132023807 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.13974765472529488, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002984069627406084 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.0902480509784589, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001917484854887988 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.0954256009916435, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018233170834364776 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.2537181223201809, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0037112591599500954 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.1665688306872857, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002559263851460158 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.17908060542609147, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0023465508632497206 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.25974058488564317, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0039990036630274265 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.1732089631652664, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00290503437818348 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.18500301101744382, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0027026113593496136 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..57807e09e08585006cf381634ae22e8a78fa5857 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 7.897479428851062, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.19090150792663044 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.4097753151220947, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004759830299635232 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.3050743649751446, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0041553698725291955 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.31691617191669696, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0038078275455770048 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.20566849161886339, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0034162536009452013 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.1459179094411777, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002514675912366735 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.15116199103567896, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002362966831815902 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.32535955342181233, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003997229419498239 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_gramatically_correct_text", + "rougeL_recall": 0.2316744800368858, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0030801010270189366 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.24261367518705662, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0027886284095646483 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.35123874989098963, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004372358007688104 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.2548999077417509, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003549861311933047 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.2659220289237627, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003275917207172722 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d77429a396d325d409a4a7b7459935da03524e49 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 9.864961497979543, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1317560469306074 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.4501732457506971, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004734510768497693 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.35400553421574077, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004181631727240358 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.367082544858982, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0038803792544539776 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.22465174511925462, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge2_precision_stderr": 0.0033633484029790346 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.17211207944695756, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002640032098747926 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.1775904737014302, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0024963206617874046 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.34368601437901597, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0038108212641332403 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.2642268452145586, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003154585439683898 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.27445570312431083, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0028600187937791646 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.3787563012175693, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004249607492303315 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.2946889001020606, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0036226116144742046 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.3057169456441104, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0033589708748168147 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8b12920263b68d235440d4563496cc48d834e193 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 11.504942043363393, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.26131157019245693 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + 
"rouge1_precision": 0.4793166897526887, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004447515176857311 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.3926779566427441, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00396929495072039 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.4066062429397127, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0036602506735932336 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.23917119853723076, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0031916812958254622 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.19316318038199604, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002618076681843197 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.19940880805541641, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002496228569517145 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.359407103102224, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003589169466666821 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.2902605810383062, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003028323235467514 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.3010015496850165, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00277134061669236 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.40309118710355224, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004055402595432995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.32783184069905075, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003485623775598431 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.33985627414169595, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003253456875343155 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + 
"limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dd6865bbe2ed4ccf9046eb01f429fa669553e46b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 12.37094125306445, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.26637203939148857 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.5019495860499606, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004233425201646585 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.4148487000765162, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003701189400722239 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.43197062064083586, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0034335264670076996 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.24964365910833303, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0029949042783772935 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.20417244071827118, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002504613430914506 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.21239127308248862, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0024116062520429865 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.3697094271111249, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003357782592019311 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.30372307503085555, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0028621696126892044 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.31619868839227344, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0026432709715011175 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.4179517060062675, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0038415175450192012 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_gramatically_correct_text", + "rougeLsum_recall": 0.344453227089507, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0033031188522087873 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.3587152017326737, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003099503498187738 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0d12f8054ae0d4485b0a351760f52a97cd0035ee --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.23009457158327395, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.032672420546843176 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.046037847651470024, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002049984498068587 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.034901976047314216, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0015828043496248269 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.03565768533846172, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0015248066303431285 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.016516540815237794, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011075729579088504 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.012463074267663072, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0007148960008422395 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.012939399202474111, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0007270998548969809 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.040481693666315224, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0018321004056548362 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_text_restaurant", + "rougeL_recall": 0.03073851262251656, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.001408552293064241 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.03123228667584005, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013412818998006393 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.0451064653533249, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0020199102881650523 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.033817754277814645, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0015239128770440768 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.03473298460911775, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014863724750131808 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..785ae7e85f17bfcfb8b75188824f6e4c32020eeb --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 12.089976671939015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.168251102142501 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5731074158985243, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003284340506365394 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.44018576325041153, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0030131809832668697 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4712653059021899, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023607480373166326 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.275133195308103, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002740669736631083 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_text_restaurant", + "rouge2_recall": 0.2075323986474321, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002196437540323409 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2227090352718732, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020690282260279855 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4165254470631004, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029803269219252386 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3172191245436191, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002462143659310004 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.34042049528282725, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021081402016807802 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4671938157159783, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003215098649092779 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3576660404019271, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002771243835460788 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.38335331690793345, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002363317168332221 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f1f74811138381ef7983d00688e56cfe407bbd93 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.306767136731397, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20748195293257823 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.586146194582904, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003201208783304345 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + 
"rouge1_recall": 0.46837981221534686, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028821983932268993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4959039342859941, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022179464050245366 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.29546214538365784, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002760222751352884 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23282858226774875, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022643379007857164 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24660179707103722, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020875846431590893 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43453348333901143, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030164445891002808 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34493209177229056, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024926657554512834 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3658398555812216, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021228022591806505 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.48901167662589756, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003188380289601738 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.39060971772253245, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00277158241175323 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.41353486620349267, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023298219846166635 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..23fc8123c6029d9f04938afa34157b5b0506f346 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.986652923553951, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14924649133846585 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5852147290185015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003198801112514953 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47309130989416415, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002863282453776492 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.49941178939651276, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002215810900097095 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2992462347854545, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00279234169219994 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23892912708756728, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002326244957735514 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2520595099329764, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002137559032929213 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4362259991876925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002981357963715629 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3508817024714458, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002481255460162326 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.37080740120712563, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021085500680046655 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.49286818963774864, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032037762403395406 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.398038444692037, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027747754778932373 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.42024778988515976, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 
0.0023378975699189846 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c9c2d1207971b2fa91301343fcae5e05884eedd9 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.355039272215512, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1397449379312914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5790492697063228, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003163418243454001 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47626870304120433, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028295282078555388 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5006142198930905, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002241219932072369 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2964235601953411, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027302282806629004 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24084946363934556, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002279965123787374 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.25325717049884455, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00212780022639949 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43244548404877137, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029330079057210594 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35437374939057237, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002483948940307853 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.37278812205664297, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002143459855710583 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4902546597328058, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003150242604642586 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4030653858271246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002753130148341471 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.42378128466807036, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023597511181794963 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..853ea97e02e37bd046b56201e08ee68f176fa0ec --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.51316933790889, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14131860296901558 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.578638440566317, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031479626596216882 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47971897395973623, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027570588512589957 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5033945465560786, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002201297123291935 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.29709476038507515, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027367014589648156 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2428822828451652, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002252237417006984 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2553196283460093, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002122343708414921 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4303055719591573, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028832325276234383 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3558059120429404, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024373374637128653 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3734887555835707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002105068605546113 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.48802659063198645, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031316303520488075 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4043415445372068, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002685631053632136 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4244461237028647, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023223935862545925 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c99d9c6808d3c110c1999f6536652ded5fc9418b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 4.513948362755348, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05324817854650952 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.18083865590618006, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0011967469680865424 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.39694748186124473, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0022358420017444636 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.2438649285193806, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001416529052761638 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.07655778766815691, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge2_precision_stderr": 0.0007880368223086152 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.1739929714611625, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0017881791897208102 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.10422182142495252, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001031777045626439 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.15644810681042984, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001029610825920162 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.34537695956020553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002028676271570114 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.2113950802978589, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0012433002277184076 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.15748716535088403, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0011172995166076403 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.3471846365481522, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0022023841121810756 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.2126259133049415, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013551634598868996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5ad342c91cbb93a996a9c651e8a52ade6b534b45 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 12.021866804949632, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1701742092769703 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.5627005067366511, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0033130096861858074 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.4277249029488458, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029565132907154986 + 
}, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4599877335951275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023317871941812207 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.2752724169297304, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027859200368061665 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2051992569808937, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021714381628735483 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.22114544896184524, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00204967320559472 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.4175635896665314, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030646772446956514 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.31431498620706183, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024553483011121235 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3388960991118605, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002107043508551377 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.4639451945129169, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032545362146494264 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.3511223032556959, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027254287026514854 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3780995133274374, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023233337570530547 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5fed4df7f1b23e1fad53b9f4612b91a659c34ff7 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 13.565233063876459, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1885705674727478 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "text", + "rouge1_precision": 0.5837242686841533, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0033033580242790486 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.44540220460576185, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002868658378007867 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.48052681154587185, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00228757243533114 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.29915385446115245, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002894369294200601 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.22399295380543538, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022420057498064956 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.24202094624513124, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021223172573097914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.4399247896434536, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003079420403202306 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3333920863937247, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024659518407944923 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3602945422534893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021337702090140783 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.4898081103762298, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032778129938177834 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.3727973962332129, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002724416991600159 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.40249470346290595, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023435648489988424 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..9cdaf0bcb90fad758ecec0fda477295b89b73998 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 13.786595723877019, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16700801225608986 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.584661321143836, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032655523852631084 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.4488272835866765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028798866593971774 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4829072150204214, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022431825256339155 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.3002028921692319, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028661199259928725 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.22756205456066345, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023122295166738216 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.24452076890554655, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021341422736736015 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.44001019373241995, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030981399154380532 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.33554664599715434, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024937602834115497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3616068258766326, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002140335910026705 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.49080380963239784, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032481844726190176 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.3763660815881047, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002741120855013435 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.40495369605288095, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002311367860372464 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fd762f03a55dd98efcb6f86afd5ad1c8aea54b4f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 13.740418727123535, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.125168592466937 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.5845829081839676, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032404783946119488 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.4449834221000959, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002791619267995256 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.48272753824834136, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00224712531324722 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.30003225779147014, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028079090189879175 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.22533554720244608, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022354658494476848 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.24445326336589657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021129956156590367 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.4403309294540874, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030238155742260196 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3341276065293173, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024525407514030487 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3627627657925229, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002165015417177649 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.4923570162452141, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032397034691861526 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.3745534207008274, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002675901478552923 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.4064464779608034, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023451178623913123 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e110a34519d83ce390d0215f04ef36179df728cf --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_e2e_nlg_cleaned_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 13.656099683614805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.17838730696724606 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.580946458312432, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003238803203078521 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.44425456017925374, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00269888517548937 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4813817328517561, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022006369642601567 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.29685416578460727, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002799255589899561 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.22352618776361244, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021763786055710088 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.24244538824782974, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020844313178218616 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.43409102063959965, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029438891102049375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3315067878833193, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023785090846630617 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.35914212757736946, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, 
+ "subset": null, + "rougeL_fmeasure_stderr": 0.002092896469507669 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.4862732685071888, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031997511868591664 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.371675962232947, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026174808107856332 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.4027374392436005, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002301106330222583 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6215f5c176bc0b3cc1bbd10f74f9b551138d7054 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.10974602642941278, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0016265580165918348 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2751641641925161, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003764947977083456 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.1550455332379995, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002188766283324984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.01715867033956244, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007491894700598756 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.044269433075030035, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0019355166036210034 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.02442715447547206, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0010547549811059692 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.08537711294726719, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011493677678798603 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.21573286471987912, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002778168194611946 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.12085117006455655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0015528468048302269 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.08875361641538126, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0013008002608651363 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.22493786700312102, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003175265648492463 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.12575944814048232, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0017727719225543287 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.826388517581962, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07659868838211477 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a92fd6f4f2bfc36d1d825537657b573af7f1fbab --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.11235330499116508, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001825855414930503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.17726301733586447, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003322946482029751 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.12859680838214518, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0019817697277987206 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.0069436817076779156, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0005112708010543145 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + 
"rouge2_recall": 0.012853271119216214, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0010000150628169575 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.008505040157174723, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0006200781636943587 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.0851208198632456, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013732771890519564 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.13231394019781906, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00231252669675569 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.09669731285510633, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0014118648748043154 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.0895178827093416, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014331191943847438 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.14231732687022683, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002682773387227301 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.10270197895040037, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0015712774454742433 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.48219162434969726, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07477715903961155 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..74c775c6aa0d36d7f5b906fff94456e80f07d095 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.1350415941925478, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002492146653666393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.18324113752555313, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0035029398409540742 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.14430978218128562, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0023207134123328387 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.01416051725563546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009707528234085293 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.021669313236756357, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0013707350830651868 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.015577366772524866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009663775690245467 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.10374039906210167, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018277864057013675 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.14027716643303273, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0025856202484314085 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11055239822249753, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0016705958650524591 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.10595316244383789, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001864959233883817 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.14515880876640858, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002838795872969232 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.11349384281168248, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0017697409720081902 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.6747495040718029, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05447974028945998 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json 
b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9a6d3a15be943f6bbb37652484701d3a9de31777 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.15983388686624506, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003478411038525392 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.21363418868995598, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0044052794440733265 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.16669766406043368, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0030859704820920294 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.026074826848387106, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0014952152913898485 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.03800452360584031, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0019746449121423996 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.027764709643064052, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001423724925135889 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.1212242849057896, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0026148303202884774 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.16165123522451272, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033030332185164856 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.1259471852764778, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022633889318015 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.12424904095082498, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0026319767934252754 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.16847726574481212, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003558450955464435 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.13002186162938917, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023502331314551616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.3490765997572918, + "dataset_path": "GEM/xsum", + "dataset_name": 
null, + "subset": "", + "bleu_stderr": 0.07452514524336995 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3190b25c3e4fbdbc3bb91de4639c2ef49a124a66 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.04907387724300002, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003234429189684823 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.05731059029513362, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0036785998464218513 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.046439855607119385, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00284410839477631 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.009713544524476316, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010351152599463742 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.012358529122023391, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0012249958901476917 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.00959324918731049, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009409652948725221 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.038493398405894705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002592388373637889 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.04381815286105373, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002801768957069223 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.03582073650035805, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021980490984669808 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.03939008585534109, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 
0.0026368066863757503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.045440716368802495, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002954025403461032 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.036871071543239965, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022698500974961127 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.24721291868418202, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.049588019713429736 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0b86f827e16f73a6a28001446df4b5bf2bc7abb3 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.0017152658662092624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0012123554660875486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 5.723912790464725e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 4.0479309228269615e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.00011077758719268152, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 7.833875430317281e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.0017152658662092624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0012123554660875486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 5.723912790464725e-05, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 4.0479309228269615e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.00011077758719268152, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 7.833875430317281e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.0017152658662092624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012123554660875486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 5.723912790464725e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 4.0479309228269615e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.00011077758719268152, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 7.833875430317281e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..af8d0796520e96525b48cdb32e4b93a33f9135d1 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.14892453306484066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019184491411793764 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.3528941190068972, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0043118852716188955 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.20671568072139454, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002528806889301289 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.03386350876730052, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010981805988616712 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.08363981215451943, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002723064071845637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.04750004978424984, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge2_fmeasure_stderr": 0.0015203677000594135 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.11077982199983927, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001386865922889612 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.2637562201881419, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032521028737830106 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.15388992488912057, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018329285326789228 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11844827404779884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016239153417563323 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.28238722274619205, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037749607667179816 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.16468265983100272, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021669575359935553 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.875432863383546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.062082888497796286 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..55ccf2499e9e784854fa7d3107b724c006e678fd --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.19960964828555527, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004107575752866345 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.2157428016298934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004055779615673104 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.19119281858963183, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0033101491874716737 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.038187194305397985, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0022349877900683045 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.04011310335796683, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0020275784302354574 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.03568595727216617, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0018632965400610909 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.15277349949813956, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0032872529807419252 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.16396576376162655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031054755283307263 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.14564552170447934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0025798467681516775 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.1550209051635969, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003284288318473702 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.1688330220331253, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0032920982306984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.1485833712808272, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0026125773661705726 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.8941294842971168, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13215157825403162 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0f102f6daae9840802997862b98b5c6dc65be29b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.26689392594772454, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004337626289582131 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.2475592856825416, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003872465100596751 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.2428878538323252, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003472823044516277 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 
0.06403837156878552, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0026911466509326534 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.05835175860926151, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023224836030261964 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05762339608594923, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002281322761921692 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.20230244190777438, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0036014595491025043 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.1862105332580683, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030118849618256224 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.18314939874317032, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0028003189953992335 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.2037770460842929, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003596337844803637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.18900362833831452, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0031234507389558894 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.18499281034459034, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0028200577177087296 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 3.3498604987705147, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.19596177585353966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..39caae6aae2f623f25093d9b3578c25c090e919f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.26286944517436134, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004547016993765234 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.23922210627832416, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00399242210516383 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.23937434803922442, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0038022538947401112 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0676498945961149, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002738019053264675 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.06043520173252685, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023483245889642074 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.06083246734082769, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0023310201660808163 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.19937399365009664, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003777710368463551 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.18013165514185406, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003193049282501879 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.18064028651608502, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.003098457083973783 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.20046547390236114, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0037863004323181425 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.18165174901569578, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0032501434351904436 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.18183242521164045, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0031193531188508203 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 3.299640404449182, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.22526369317807593 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..898ad0f62d92969eee95861d06c8912e85fe0c3b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.07165582917605977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 
0.004574396501698046 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.055218754309569865, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00340554578019057 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.05800954094661802, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003496221478335308 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0170476050564283, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016378678918016383 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.013811166550490082, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001320549075325777 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.014481860106880176, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013623544383952462 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.056339128495289034, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003806969072311165 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.042227353071765686, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002652946223493758 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.04451543084785548, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002727104103466008 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.05702888046338904, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003844631247600613 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.0429696011839616, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0027150651579781033 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.04521446022524568, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0027776518676359114 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 0.06983354124732599, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.02299895135254461 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c2c30a973a0ee97ce076fc21eac35709c232ae69 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_DOC_tldr_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.0018831909793706626, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0005784722940521672 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.0013454709318106806, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0004064972952939784 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.0015297630264254271, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00046352326818864206 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.00010890576928312776, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 7.757501140277646e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 7.004002287021154e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 4.9514840646779456e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 8.502847929189122e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 6.021299119569327e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.001468303303617866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00046494145314115326 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.0010215023772211784, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00031150675030788545 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.0011680695487629835, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0003584965566749215 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.0015790808908105478, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0004907819897375802 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.0011142872267091062, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0003381488629476795 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.0012690548984618838, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00038605695478641824 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 6.30933023510575e-46, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 2.4028369955982456e-38 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + 
} +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..03612b9dafb7e6a408403a1645b2566fb5ef50d0 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1357849290108061, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0022085026440544792 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.31125110203110956, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00478150841719741 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.18544940835038826, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002792598389230579 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.02806264964322551, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010490970791455262 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0676376603721838, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002556389169521471 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.03905778248153537, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014517464622913717 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.09941478269511335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016710762156098235 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.22831881338494908, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035475652964304587 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1355227325800368, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019961537320126754 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1090062194949961, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001862164401400258 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2506136306420929, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004002812528613584 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1487940648277913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022923438101904283 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.6078680919808515, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13305526786747834 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aecd17207cef6e034ed14c66154276dfba6d6337 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.18487966767987538, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0036696902142416914 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.26149928474217465, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004228571030201456 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19812664092391155, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003093672497471887 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.038450464485312766, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0018721013867502832 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.055093479727191086, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024314883171235226 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04107507874444822, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001790944203843995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1407652383762195, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002867258911637599 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.20074896873173023, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033400948802297637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15109086002249855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0024037820360525127 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.14290226133870376, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002899542963069592 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.20559699548522795, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035828941817263715 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"article_DOC_summary", + "rougeLsum_fmeasure": 0.15400915124743492, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0025002059384586265 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7426173208536198, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06390989391271595 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6ab1e8e5a07d2f9597474e444bb2cff0b13d8155 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.20654064116685483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00402670019732593 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2761636603625548, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004261678708829253 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21646360740135792, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003295332693289183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.04926519014044061, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002298517760897842 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06300838426900698, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025900885284329783 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05024654283296407, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0020978860757402073 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.15783804831230816, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0033092482048118115 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.21095994736677962, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003416543139413067 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16496562440527163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0026851454383202047 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + 
"rougeLsum_precision": 0.16058710601356815, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0033312368166406067 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.21694184867744362, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003641660551599739 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16854842502615797, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0027533826606543164 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.212042173147054, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.14754927518486374 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5a27fdc75970dd0e7eb06f086272da4dbcb43644 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.20968934404830927, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004355318865286731 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2714957979387898, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004428046228223116 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21602695443039088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003545745499002209 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.051495808432128344, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002483343433806453 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0626272643756316, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025430698963779036 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05092039724374764, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0021028017555591145 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.16001543314957897, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003518230552839032 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 
0.20782119995033888, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035214877778981 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16465282938122464, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002817348856079681 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.16265225378046935, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003535398294040215 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.21338945027387293, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037313572424076884 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16805945710868087, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0028771833437224903 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.292440773010242, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12688961626559822 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9ebb1177c5104032ae4ec3d6068af56e8880b3cb --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.066317893890864, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00423472373517398 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0668307970074077, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0038903454010429882 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05844521610204345, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003357755822076642 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.01516582795790956, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016864721571447335 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.015164369380224488, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014785207392795535 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.013427211505942963, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013325070824321361 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.050958054485125935, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0034077121993672244 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.05078596392429664, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030413833239327495 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04435132297085604, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0026237858818432148 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.05197463436429066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003434452220019136 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.052735906009332555, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003177823933607375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04569745564992877, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00269219477160947 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.34928748143158433, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06968377794297799 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9b99e5dc8bb75a6e73ab9cddcbb83141868101a7 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002581036492851647, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007454396977849946 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0019439835173572402, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005504845967048701 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0021633187762348204, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006160758089028403 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.00030731846769582613, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0001384434719538918 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00020070076673850258, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 9.047229228571321e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.00024003297882235707, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00010755582565714237 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0020398313639409388, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.000580520638938891 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0015747914931411474, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0004430161114171138 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0017320310946117893, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0004885245020078372 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.002194331414389935, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0006175664744399804 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0017109237047450568, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004811354022989026 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.001876404823891179, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005270763478001978 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.845610532495598e-43, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 6.163279325803633e-37 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f8dec65f96e26a58cdc92d2278a89f3e170daf0b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.1392916956409755, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018481196716208217 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.33138070505525874, + "dataset_path": "GEM/xsum", + "dataset_name": null, 
+ "subset": "", + "rouge1_recall_stderr": 0.004276391177461391 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.19341295101992514, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024605445250390295 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.027263388973015015, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.000983425389980829 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06778244092557704, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002505239823925805 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.038300051518121075, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001369595902243146 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.10118400957605353, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001314306255628095 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.24197014523792498, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003179571094954232 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.14060204154591033, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017583084261119269 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11110341692578951, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015204648459418455 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.26610036052943387, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003666908475046308 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.15451525716329884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020462224153125503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.5021646844749303, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06803860105093382 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7cafe6990eca51d308e3d16aadb10b1615319a6e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + 
"prompt_name": "summarize_DOC", + "rouge1_precision": 0.17917229015158215, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0034616270337872687 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.28234191998252756, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004328228914698034 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.20089012009350235, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002978496072546475 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.03775920828695508, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0017965171904386755 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.061221540970532014, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024602671401857466 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.042449076402835885, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017537376058430483 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.13568899758643302, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0026852949458789316 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.21508610392050376, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003368556633613049 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.15212125887603897, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002276633466962906 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.13860339464985375, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0027145150651571065 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2221076063699141, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036838975194121463 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.15618747415464568, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023944466167414584 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.7438099023912716, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1601928555347937 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bd223421a6ba7d4ed09eaaa8a3a43b1f2fcc9444 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.21314917496652097, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0040600258534636575 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.28017732093506925, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004156332947155909 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.220351016432514, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003188805586738867 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.05029543701564033, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002351457071629338 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06347492383046742, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002463338288106309 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.05038721991867873, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00201183920236601 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.16282719781485017, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0033585840074919944 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2130973674647334, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003244727756549482 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.16750321297579993, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0025679565650771298 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.16587114072654197, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0033789416242402844 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.21968584807863187, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003491606534581178 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.17145097615519603, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0026315536730007077 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 2.0985070297675317, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13154446649102086 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fd1e3f1dfc44b71f97a5cf51db479a6bffd36edb --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.22280760154852117, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004390904652685638 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.2660670648202862, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0043755645907187345 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.22126778715622059, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00351049034903905 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.055569530372989805, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002485660853955144 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06528945138217346, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0026099692030146695 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.05451394163011369, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002197533206236956 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.17071198449258404, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003584396198060428 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.20296992347781656, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034513932905057944 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.16849308202989022, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002789201858785969 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.17303337856565984, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003592404654601901 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.20861557263054867, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003716319779676917 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.17168891280364598, + "dataset_path": "GEM/xsum", + "dataset_name": 
null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002856280916182101 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 2.696335421679655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.15429316902371332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1eb8f917ede0378184e7cf9327af55f466a0af5f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.06459589021014682, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003891095380846983 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.0648529858041251, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0038243852386048685 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.058187417979200434, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003298768398525437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.01418629371385429, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001531715516561685 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.014177449560852498, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0013311123981377007 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.012582080149982773, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011999611107423268 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.04942622014780535, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003112611733184756 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.04830459159444718, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0028657851487914514 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.04360619288210294, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002517973327420018 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.05058606223316683, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0031519936739739054 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.050243110562926054, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003018354788190834 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.04491124074474275, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002579713572678555 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 0.24972558565783795, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.053684508836508174 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e0195e8c1efcd467e6ffd4ad03b248db77a67178 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_DOC_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.0025190479973907745, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007080212578711208 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.0019522897365303632, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005315917820907907 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.002161934536709088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0005948885851124559 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.00038639243032022986, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0001470964395764042 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0002587271219346691, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 9.980221995641359e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.000307333743214037, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00011743634313830978 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.0018826374863526564, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005147273086202652 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.0015177893314996913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00041368575160514705 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.001652183701042147, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0004498633984143077 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.0019969885440999403, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005512037388198252 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.0015731204884741835, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004244780256693147 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.001726760477833854, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0004678249512576879 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 7.469531234917508e-40, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 3.8389499820293147e-34 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..333bf2ef8a7549aeb4229a931427eb41f379c579 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.1455268959483828, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018622749607537018 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.3423730472851513, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004243968510683615 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.20150117996288208, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002475420919807577 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.031113013875646793, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010564472781666096 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.07657376393599181, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025908898115614702 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04364050860274179, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014638169607378785 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 
0.1069927606611518, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013238092147911304 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2531894635221878, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031552855238693913 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.14830049281611307, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001767565901905731 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.11608704788644836, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015422029588228266 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2745676793472743, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003630899569864946 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.16095008318175014, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002069319454417293 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.69159177851078, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08597377916560887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d24e1d58ae078a2b2d09f39b800692ab54ecb4dc --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.1656766838106151, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003311707578975503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.26380436066405916, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0042896104700150565 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.18687918757076946, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029547400275911884 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.03192700954228503, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016638524448754533 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.05133067604137321, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023201541616321535 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.03611720929073555, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016831709482522717 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.12671804880247828, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0025399475086942853 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.20219533841657047, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003293526711844687 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.1427925316976916, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022483842833851977 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.1300589316850642, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0025785593599623085 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.20963166196523544, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035927461486533713 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.14726996806804912, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023615534405494057 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.5154327790854452, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09075795006322013 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..27ab5cc38413aaae538391d1e68d21aa0bdb221c --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.18909484566599685, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003950819844394485 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.2718769604920019, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge1_recall_stderr": 0.004223865744923052 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.20341496217018784, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0032458847070410616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.043554704955756154, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002176621642380098 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.05972277200498379, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025667116192052124 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.045462384799217964, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0019968008810941603 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.14613433508915188, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003224590842312493 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.20951638761821206, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033167476690494297 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.15660542083343884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002596200719484403 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.14840970177848015, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003239839270586971 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2149189211868496, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003570016109369942 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.15972610503232226, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0026706170411476843 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.8561837473144438, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1652517173077056 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c2c91fe578e590f82db4aa1b8ac413d6c4767d89 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.18257690791972123, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004006918517667934 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.26033673075849767, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004352519745393548 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.19598928489102377, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0033814773326124085 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.040870316407448856, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0021374579637365066 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.056270108437116266, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025158970204257696 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04279947767402745, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0019564637943375314 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.1409116045729758, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0031679784442123723 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.20275659130062612, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034509880571316465 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.15173273328212009, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002671616136858398 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.14292997261659537, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0031880065498972026 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.20768819850292705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003681311843517383 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1545405886375339, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0027449821535318057 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 2.0809769345599323, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.19432487424847153 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f073feb2e88d4be4fbba3c9a47a96c27579acb19 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.05108032804071894, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003390049476133475 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.05880001625627721, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003736477018998282 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.04890767455000632, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003057134704734198 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.011543780490512906, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0013067634333148541 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.013097981096374263, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001376718188766686 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.011057807884388515, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011751788248271782 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.04029696513661245, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0027502948998389862 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.045549874016116855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002924162344624697 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.038159094630815064, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00243434744749608 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.04104440973285439, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0027735874936627284 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.04713611215249886, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeLsum_recall_stderr": 0.0030466751631097844 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.03915726021159841, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024864361723350315 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.2880719761678462, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05189985195943128 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f250dc49da36bddd932f9fc2d267029d81fbd750 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.004002287021154945, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00150882185401594 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.000505816634884062, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00018048741472614392 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.0008888379700377073, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00031704658560943185 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.0008576329331046312, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008576329331046347 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 3.430531732418525e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 3.430531732418596e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 6.597176408497164e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 6.597176408496901e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.003716409376786735, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014258502483873398 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.0004553676388190836, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00015803630071122688 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 
0.0008030746767272441, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0002802822184911789 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.003716409376786735, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014258502483873398 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.0004553676388190836, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00015803630071122688 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.0008030746767272441, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0002802822184911789 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e7139a70c73f6401c310857d1fb3f368e2abaf96 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 6.379963927829925, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2573526772203268 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.08430156983395772, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0023853160409478395 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.740846474029314, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006167016289652982 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.13918517262851055, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003031126931687362 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.06966123539471465, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0023674498306732014 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5868627439659466, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00770185112252964 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.1134449009766221, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + 
"rouge2_fmeasure_stderr": 0.003040297061765627 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.08383375028656737, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0023771360831379928 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7379733730054571, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006199103560931277 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.13842755446553195, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.003019229498486827 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.0822443257804446, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0023831466991385618 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7222343828686507, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006418079058535647 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.13560704839500504, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003029190011581066 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cce3172ca9e9db213975aea1b09bf5b35ba3369c --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 57.42465098900179, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.1078815648831635 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.6808088885246102, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006796979519538104 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6336659473598535, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.007414216432858779 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6357497501654351, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.007151203933587638 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5255864223795965, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.008044938147511789 + }, + { + "task_name": "piqa", + "prompt_name": "Correct 
the solution", + "rouge2_recall": 0.5006490922843432, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008204408297570282 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5007567674149286, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00806429537616059 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.6610225187736701, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006991919706073141 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6202751689797952, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007617793392838218 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6211515568694629, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007369923868292126 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.6647318479780407, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0069741652380140694 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6222421250530864, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007583094680125747 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6234521192835725, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.007332794233638315 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a9c8f28694da8edc967df471da0f59f81a517f85 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 60.44635224572798, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.3446237122314244 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7036876644842045, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006496630984767584 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6582369637194351, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.007108402892336969 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6632876537876221, + "dataset_path": "piqa", + "dataset_name": null, + 
"subset": null, + "rouge1_fmeasure_stderr": 0.006816688459749599 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5507183829339078, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00788053127659401 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5241620180691751, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008047211431302723 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5268362804678172, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007890623827590655 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.685976181176324, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006688246337688496 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6455843829048997, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0073101108152440475 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6497429564395274, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007038662454206784 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.6895552326789159, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00666828093642199 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6474552209600517, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007273618289358875 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6518983844565237, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0069968600613674855 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..43574b8253bc8d319b13618eb84291e982511c5a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 63.99859340473141, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.0181011163291809 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.718671836461919, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006294527586528868 + }, + { + "task_name": "piqa", + 
"prompt_name": "Correct the solution", + "rouge1_recall": 0.685273079936742, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006773596698921195 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6872300555455244, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006552132289138089 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5718331533007336, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0077333134242707 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5520767659854839, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007877835743632268 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5527349338268147, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007756677211709758 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7031204259857672, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0064960038038979605 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6734324094649778, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006990313714994473 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.674737159574106, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006782676101560994 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7060532814193256, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006469915199530839 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6752154921932333, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0069521353853381936 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6766918854024231, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006741989553841211 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9c5050e83cf797bd68356dd1ae6b9a07f2819496 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 65.68343284307088, + "dataset_path": 
"piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.4512625096573333 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7245064757365981, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006186217724643701 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6966042289172039, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006598392835454942 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6972916814165189, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006402402634467684 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5796069278121242, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007670281267649787 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5620584766606035, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007812313870631962 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5623294588317255, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00768450794995948 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7098784243871913, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006413128988144364 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6853303584523095, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006832123614751252 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6854739713954158, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006648892928897857 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7125589718597797, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006374119063501137 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6870178382316655, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0067907383524628976 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6872870920135151, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00660382042497619 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_5.json new 
file mode 100644 index 0000000000000000000000000000000000000000..37647866ca5baa6c0fe60f14d3bbcb5c787dd080 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_Correct-the-solution_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 67.80635157272806, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.150124553790288 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7319220129062811, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006041436418049977 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7105094894163213, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0064230522240220625 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.7098871630878016, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006221650632659693 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5897715500763321, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007568603611022888 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5762294144509804, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007700758909051833 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5754350955997378, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0075768108295479145 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7186482704494902, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006254631595816518 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6997515950999912, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006643987175394352 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.69879364437856, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006453508082021247 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7211579539003966, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006218818928365821 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7015037185292219, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006602458898563086 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.7006368042688799, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006409941634775962 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fd8b5d88e2b0b7b259fd691ee3a80dad02399950 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5e9d5b24bb1d8618053b0702e8f340afec05e945 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.499455930359086, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665817258899178 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.499455930359086, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665817258899178 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..62204e0fbbd75812776eac57f682d15b55fa12ed --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate 
solution", + "acc": 0.5114254624591947, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011662778026451666 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5114254624591947, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011662778026451666 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..376d7e5a1803c0a99617a80f727f50788b912860 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5195865070729053, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011656869979288453 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5195865070729053, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011656869979288453 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eca79c3a14c0bb3c26156b3f9a80b66d288922e9 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5087051142546246, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664055982032837 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5087051142546246, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664055982032837 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c81c7577c5b0c40cbac9713e07e5aec7ed0ecc2e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665713661738877 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665713661738877 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3a1dd826cf317c9c3218a54c92bffa77c7215b79 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.1505467019035778, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.014682785810059218 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.020272307866543455, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0005265772258608504 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.2191478275652487, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004134750154842083 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.035170959301194286, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008338435093530204 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0030507521899887356, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00015211877263159542 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.0381262159440338, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002020826886323144 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005364179202157353, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 
0.00025695320904709255 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.018486089076030096, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00045586856654085276 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.20445528794712298, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0038441092026702847 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.03215186695687886, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.000725133860372767 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.01668000026963013, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00042749970187600867 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.18840601223965645, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0037496927990254393 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.02901357418513821, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006789454181899272 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b025a46f80bf7b9a10a265afec80555edabe07f8 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.2943694159026868, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.019001956315490544 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.06889633960575121, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0029528371645033075 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.1749622652355825, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00418913262605897 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.07114356718637975, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0024530989412474146 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.013747092252940312, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0010815279565432835 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.034659257546783305, + 
"dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018737240504667597 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.014533470414966138, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001004705308911276 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.05678669759372553, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0023995612619735897 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.15634194763841833, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003842783856016258 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.06008732553084146, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002063583089286293 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.05799081392533638, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002547867930613753 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.1492710512090213, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003695060862834212 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.059790388949648735, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002132773374026426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d8ac273645df7230fda5984d42bff167ed8c8b3c --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.7429861146921837, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.041630683755388004 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.0782745309191772, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003661536035808258 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.08833366558571519, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0035763050033546058 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.06384482992877637, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00270802656663024 + }, + { + "task_name": "piqa", + 
"prompt_name": "no prompt needed", + "rouge2_precision": 0.018971204567379225, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001716087418066384 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.019513437693357994, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0016266277324259893 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.015161629432520633, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001361177448233304 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.06733854475743022, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0032077160777706985 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.07767663804319452, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0032078055204207683 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.055271222595081285, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002403409792808055 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.06993779483360749, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0033484481247688168 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.07770123233082768, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0032126513659949513 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.05676099920154195, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024748610608637403 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..246c26c202eb0d6554cd4b8993526b37c15e2430 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.5886974488567144, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0588610972424531 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.07828648309935715, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0037910985558715117 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.0748434759683687, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + 
"rouge1_recall_stderr": 0.003447795552872216 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.06127218818801527, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0027834226920066146 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.018883425449363365, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016912212541414398 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.018651177545109577, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0017901304066660615 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.015325075589192747, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014230205586656989 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.06753098472798698, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0033223511190554516 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.06602107534803005, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0031350941534849562 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.05326156438733179, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0024939890097848264 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.07000376450757803, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003449013306952803 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.06699980444105336, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003167802494403317 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.054652079076486976, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002541866737238746 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..41d87c3330c4486d02e4b1336325c943e685f6db --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.5677418183572901, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.030389830098839228 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 
0.08869524418004159, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0039919536068413145 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.08221046365791854, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0035462487851992963 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.070280450522492, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0030013676962094026 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.02184402587413536, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0018288818965170844 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.01942179813273096, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0017457514434629806 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.017234383744728854, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015043166573897747 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.07691570690053545, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0035155841708412053 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.07231895867275351, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0032076680327550494 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.061159781312434845, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002675699108061221 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.07972711816697133, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0036660570172046473 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.07381189517508352, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0032437272615878235 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.06284895391649062, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0027355571624958492 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cd74b7443951cca8a18cfd35bf2fcb568513e757 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_no-prompt-needed_5.json @@ -0,0 +1,133 
@@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.6340158799904358, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07074410475536543 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.0980231979267413, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004088794455570127 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.08967139314514878, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0036211300336253882 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.07798760826694028, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0030806394748408095 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.023143447865316625, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019008288816567988 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.02009787194324948, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0017663286220115724 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.017977968875879748, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015069906896766742 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.08522653187716864, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003609502601936802 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.07920267912848071, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003267765504312832 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.06817709804550225, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002749241681504631 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.08765190358636991, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0037121088015769177 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.07992387309151867, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0032810895663960803 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.0695072665160449, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0027876585365403865 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..126f513377638b3f45450a37bc7570e154fc3cb2 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4956474428726877, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166538214464238 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4956474428726877, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166538214464238 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..171d950b6d35bf752262c4a1bf85efd5b97e10ea --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5065288356909684, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664829595210969 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5065288356909684, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664829595210969 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0756d3b8850c54fc717dfd8b365ec81731c41ae2 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5168661588683352, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011659185184878913 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5168661588683352, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 
0.011659185184878913 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..96c23c7b6f5a2d5625f97d5684b7de8eb308c131 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.514689880304679, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011660788281735494 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.514689880304679, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011660788281735494 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0810c6364188e0972a6efb8065681bdbd69df998 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.500544069640914, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665817258899168 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.500544069640914, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665817258899168 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3fd6a4a24fa7b5a7a83f478aba428db953123248 --- 
/dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_pick_correct_choice_index_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4896626768226333, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663330673075898 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4896626768226333, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663330673075898 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..da3fb8c7e975a61373484e8de6e32f205b6c4c88 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5718171926006529, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011544859155318846 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5723612622415669, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011543009623282832 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..53d929ce7393767181a5713d9b660b2ff7efd31e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5625680087051143, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011574126069682387 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5609357997823722, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011578865649321295 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bf615108601a20d833c09582fcb8eeee3cd69d5b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.55930359085963, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01158347809065713 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5565832426550599, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01159088337366686 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b3717432ef19cd518da4b50ae331084281f1b476 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5620239390642002, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011575720065594108 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5603917301414582, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011580417248656574 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ed74c1c7ef6a313293314c1642a5ca04505a85aa --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5609357997823722, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011578865649321297 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5544069640914037, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011596554080987647 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2485cad7a121551f3a1289d3b600138a76f83fd6 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5565832426550599, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01159088337366686 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5522306855277476, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01160199979686681 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7f2f7388f089822926514534c37bcea3875286d3 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.578, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01562562511262066 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.499, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01581926829057682 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..94b6f09fcbe3227492175edda3cee4e5dd51a2fa --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.645, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015139491543780529 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.62, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015356947477797582 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9826863ecb865db23168aa0213f24336d6013c10 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.66, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014987482264363937 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.639, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015195720118175118 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1d27de4af62d7cb03e125481616d825203ae526a --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.666, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014922019523732961 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.651, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015080663991563098 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5589f3bafae58a7533ce862ad8808e5d6b37123b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.677, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014794927843348635 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.661, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01497675877162034 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a1fc8109ecb066aecab84c2c6ddf805aa11eab51 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.682, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.0147340793093119 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.672, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014853842487270336 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0eca25e58de7d5205b5bf18c7bbaa617656e49ee --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.849, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011328165223341681 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.778, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.013148721948877364 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..370e60f5946d8a133be833d56b549cb70b73dba3 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.894, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.009739551265785129 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.876, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.010427498872343968 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d2cdce9851998fd879a6d92db793d0677d990a8b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.898, + "dataset_path": "sciq", + "dataset_name": null, + 
"subset": null, + "acc_stderr": 0.009575368801653905 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.899, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009533618929341006 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f6ac512bae5a729e8a44847de4d46985476540fc --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.906, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.00923305200078773 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.911, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009008893392651547 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6f1f63ccf1790b0aca171b08efca01f1ea54c441 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.913, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008916866630745904 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.916, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008776162089491104 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..0e01a255eaade6ae793699cf2854fd1a28c5ed27 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Direct-Question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.913, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008916866630745902 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.924, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008384169266796386 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3b55d46c6d838cc8d9b6f7411df84fbcedb4030a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.353, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015120172605483694 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.37, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015275252316519362 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..342fb9810d1e735c4895332b597669ec150da78e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.369, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015266698139154615 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.371, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015283736211823187 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6de67f5cfbc332471a36b321a9e14dc51278bb70 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.387, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015410011955493935 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.388, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015417317979911076 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7b80d7b202e7be35226d0d57ebb1f5ffa8fe1d17 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.401, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015506109745498322 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.406, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.0155372264386346 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f5e76b509559185e28fa0222c85d2355f24ad7ae --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.39, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015431725053866611 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.399, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015493193313162906 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4eed2dc7ecef0f87f955ffdfbff17af66ff36743 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.389, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01542455564730849 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.386, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015402637476784364 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cfae8cc3f93867c44af975a0c740fadb3d95b72f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.431, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015667944488173508 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.413, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015577986829936533 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b089f5f1d27c9f84474602d7079525f9f40b7bae --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.447, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01573017604600906 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.448, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015733516566347836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..721cc9f727bea83fc85dc63d3b6b73594d8da598 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.481, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01580787426850585 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.479, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015805341148131296 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..065e33bcfbbbd88b49d269f071665e32799f7880 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.511, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015815471195292682 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.512, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015814743314581818 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e3550751c89530bd69bbf224d6cc70c370ecd3fe --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.494, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015818160898606715 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.507, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015817749561843567 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..802b3d9163bdcaf12d288a573c28344d446fbef2 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.499, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01581926829057682 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.501, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015819268290576817 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a62abcbc02d2c6a2bd0f7875a674368fbea27364 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.417, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015599819048769616 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.424, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015635487471405182 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a463d1cd56ab5a1b76feaead6d31d55d67450965 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.438, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01569721001969469 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.453, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015749255189977596 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2ba40404a87836dd304f43384ca6c52d34b5571b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.438, + "dataset_path": "sciq", + "dataset_name": null, + "subset": 
null, + "acc_stderr": 0.01569721001969469 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.439, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015701131345400774 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..623a16aef00f3cf4af055efee841ddb50c786059 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.442, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01571250721186421 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.437, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015693223928730377 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..280fa9bb8aa3261a8174021ec65e98de2191ff30 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.436, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01568917302314407 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.444, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01571976816340209 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..77374aef33c8e2f82d3dd8beabb781c53130a137 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_sciq_Multiple-Choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.422, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015625625112620653 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.432, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.0156723202373362 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ee1735643a9bc7818de413d67b7798c7b52be5fe --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4751469802244789, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01154813982307477 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.5045430251202565, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561954965856519 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6ae81abdb3eb4389dee053fdc5c81dcc30560883 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.5056119722073757, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011561703928784327 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.5114911811865313, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011559378273599128 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6c454ce236355e5ab0bd818aae4327c64e6f5d16 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4853019775521112, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011557435464292914 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4922501336183859, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01156104327886354 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..83cb5589afdf7abb420b0d4665408de95fb7f40b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.47140566541956175, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011543509045585206 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.47835382148583644, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011551591851683337 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..3f46621e836f5c717c15767427d8fbb94610df7a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4778193479422769, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011551049647290314 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4735435595938001, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011546234813777395 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..20efc9dc9ae049a8308618d8d2e4f8f4e811dde4 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.46980224478888294, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01154132532033662 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4730090860502405, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011545573278697237 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6f7f01a3247f2d37106dba541ecdab724e7c3e1e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.5034740780331374, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.0115621531491683 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5200427578834848, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011553138977961008 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2f249147844c54940c748766f4c04fdc89774745 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4794227685729556, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011552636515221856 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5077498663816141, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01156104327886355 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b5b61b6d8684e345965dac9ef76e33410522fc8d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.47621592731159806, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011549343521088358 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4719401389631213, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011544210396951672 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..4d512e26f461f27760898831e847cb870f9d7cd0 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.47033671833244256, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011542066509767008 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4607161945483699, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011526690316014587 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..725d0afbb05501cf35052efd68ed70407c53df89 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4623196151790486, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011529552555884573 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4564404061998931, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011518470676766509 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1b8213089a68fbd7cfac12ae088283cb7ce611fe --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4623196151790486, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011529552555884575 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.46178514163548906, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011528611805439891 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4dd3d2ed8d31eadce5cbeb1e16e8b2446a5c99e1 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bd3f723e1b9d7a8e12510b155f932bf7c3e9bf37 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b797e266318da13b3838f954a21714ab132024f8 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..78389a4bce1b646b6e856c4da11614a969c06bf2 
--- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a79dcf6c66aa05fb776112418da4995a72a153f4 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..550db1d930eafd08c3697f4cff2fbbe1bd4debae --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d8f1009349727f2f924bcdc7bb481db17c560f20 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4826296098343132, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011555452669106635 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.5024051309460181, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562298481438055 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2b567084926a355adbfe3dc9395bd071e3d3fbc5 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4938535542490647, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01156155858904076 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.5141635489043292, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011557792331301674 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eb9d7a2a23b08c112879879e1cf7c6b90e560f4a --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4853019775521112, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011557435464292918 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4922501336183859, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561043278863542 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..c7df8d178099677ce8f232648c3aaefc49e11660 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.47140566541956175, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011543509045585204 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.47835382148583644, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011551591851683335 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..be1b5961c4ba3dca907b1bdde341644bf4df38f8 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4751469802244789, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011548139823074772 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4730090860502405, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011545573278697237 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f0ec969b3fa6ea829fcfd8f37d3cd08e237f482e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4735435595938001, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011546234813777399 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.46980224478888294, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01154132532033662 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5d714a569b1767f5cd08471fa3d371f4ae085c3e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4965259219668626, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01156215314916829 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5146980224478889, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011557435464292923 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f32d32acbb8eb736f9f6a43bdb99115b753cbce7 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.47140566541956175, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011543509045585208 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4997327632282202, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562430600098487 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_2.json 
b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4b266c3ddc8cf2878745ce3b3096a5fb3819ed49 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4767504008551577, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011549925483927454 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.47033671833244256, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011542066509767012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8fe7936a9587e1aac41f5a2ebb51b065c65a4416 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4708711918760021, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011542794417345719 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.46285408872260825, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011530479981182624 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7167b24e0cbd7d1f1b6aae77aaf32df47f4be8c0 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.46392303580972744, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01153229486915312 + }, + { + "task_name": 
"story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.45430251202565475, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.0115140402455835 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..06748d213da8cc123943eaa2b40021a8fe58f200 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.47835382148583644, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01155159185168334 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.46018172100481025, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011525709570367504 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..68b343d89175c782446ea1b8c067c4d50d0f2a84 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03003973059219781 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end 
of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b8a5e552b9aabe53986a1a5bdecdea9dc913b8aa --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317184 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03006330041190266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0c1dfc321acb0ac3125fdca30d1f27f6d93c4a69 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03003973059219781 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029992535385373314 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..235b48160ae577672e89d8188c9380929638ceef --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030009848912529113 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + 
"config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1110fae519ca73d5ffd3243af860e49a9a1815fc --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5451263537906137, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029973636495415252 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5607485fdcff553dfe2ae617d4699d1e40517364 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.555956678700361, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029907396333795997 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366422 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cb1f2cd8d2f20ff4f2a276efb8639a1f96990c34 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..665f48c5ce17e259aba8ef0b731f3e742a0632fd --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..55bd27a788498752836b7e07530b1d51ca49a718 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..69002af90ed5a081a10a8a72f963f246b678fb96 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eb24928df1274120cedd4e00e772742b6137735d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030009848912529113 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5631768953068592, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.02985524739031494 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..244963660987a7243a4baef36f6beecaac53ee85 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030039730592197812 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03006330041190266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..26cb93f2160af5c59e5191b521fc421f6be647bf --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ebf696fe2270cdd7ee259c55c71822987cedec76 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030072723167317194 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6a7166506acebe0ecaeed8696fab9f93eef8b9c2 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976633 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ff84d1246e22c81a823b8a60dc8f2d10d133bf01 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..22e647a8eca229fd45877e84ba62d045d9ce9a93 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0c9e5f663ffc6d0cd36ccf82f92dba1f98f4c923 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.4657039711191336, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030025579819366426 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..db94e7c5dddd0f92963b434eb1249a07c8ad1d9f --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030039730592197812 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..005dc5e38461ee819679dd345c1be3de3fb4b0d8 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..78834fec596f81e902d162c2884bafbaba1588fe --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e08ff3b6afe1b582daf9df7a9cb4808f92487724 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03006330041190266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..834c86c6bd921813478ddff0f65951f59d6bb04e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5487364620938628, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029953149241808943 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e0cbc427c069d07491a7ff82f7a7ba42bc1a0ea5 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_guaranteed-true_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029992535385373314 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030009848912529113 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a56dd40f6e1a09c7a0165717f64d43e0f8e5dafa --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..50385508eb45445cdd37ee4677366622c9d49089 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1efc73ac0d6cda51c0ca2a115b2642c5c3bc50fa --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_2.json 
@@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030063300411902652 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c459c8e23887477857c52b844367c292a2431087 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5fc9a7e33df76816216a2a8ffe4b63b5381568ee --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7f8368490d2da70b9789237cb8744fe9eeb151dd --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_superglue_rte_should-assume_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030039730592197812 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030009848912529113 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3a09d942698733f120dfe236b488198a26e2ce59 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824192 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.4925019731649566, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405090552122858 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3fa8558ad24f6c1de0c0872cc2226dc258862e05 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5280189423835833, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014030404213405784 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 
0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367585 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8b751d2b3a1ec9454856d51a33d154d9e8101ffe --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049294536290396 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225632 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..841a1a5b5b3785c9669cb9312729beb913320592 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5209155485398579, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014040185494212949 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..97cf0b6fd0e99cd79377ebac9878cf56474dd23b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5114443567482242, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01404880419985932 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616441 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ac4fa58252cbc5c25a88ff7b853c3ae54edefdc3 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_Replace_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5138121546961326, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014047122916440419 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050555322824194 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e5ba6ff73aac794dcd1954d2dda315257fa7cd7d --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405213114691586 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.0140519560640769 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..44f597d8699e53c4dba8eeb4ce168557c19c02aa --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.489344909234412, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049294536290403 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.48382004735595896, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014045126130978596 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c956cab935e629a815ac5d995a7087cae7ca843e --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915845 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529009 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f5161863c4e6f22b0c460dbbd8330f80aee88f1c --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_3.json @@ 
-0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824185 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052131146915841 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fdb321c2d167287125f9f2c7ceccf3adf09b65b9 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5090765588003157, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050170094497704 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225629 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f54e6554757a70f834ece77e8b6a65233609b17b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_True-or-False_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049294536290396 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5153906866614049, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014045826789783661 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4da9fc4508a4d53289238440be3c0ec0e0e6eaa2 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4925019731649566, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405090552122858 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.48855564325177586, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048804199859322 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b3c7c821d219715d2220d14432da3a2065eaa61b --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616441 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052131146915869 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c8b84e8719c9fdda2b7775c0a638e4a7fb45e183 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 
0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052446290529015 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.49171270718232046, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050555322824189 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9a675d1da5638319d917b0f3054a78c45ce153ac --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049749833367585 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..303c11a640cfa8cf4bf95be930a67c83185a3ec0 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076903 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529012 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd657cb7cc0ef05305ba988aee70def94223008 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405090552122858 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051500838485807 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b5bec7062fbca875f762f379259ec4dee629ecab --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616445 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.49329123914759276, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330346 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b9c6072fcf9fb5260d40253bbbce7f136f9d9aa0 --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5335438042620363, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014020826677598101 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5256511444356748, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01403398095610855 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c5a1e9c7068a0b09618d15557d13d5d43cea8ad3 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5232833464877664, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014037241309573642 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5201262825572218, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014041096664344327 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..08e860aa0364e468fe59bb5c2671fe1d55ba8766 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052376259225632 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330349 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b5898148dd32cf7ba4c98788648ce1a3ea3289cb --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.516179952644041, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140451261309786 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5114443567482242, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048804199859329 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1ecdd93b4595450eb18ccbb63034c6224b3767af --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_stand-for_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616438 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5224940805051302, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014038257824059883 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_0.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1733b5e682ec50ba792980d276d648cfaf7298c3 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.489344909234412, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049294536290403 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.489344909234412, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049294536290403 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_1.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8ae40c9fff2e3050157738170b26dce863d362aa --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.4877663772691397, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01404827882040562 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.48382004735595896, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014045126130978603 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_2.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2de2d3b582060fd3ae0d51d80d8437ebf33e5b79 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.48855564325177586, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014048804199859335 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.4861878453038674, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047122916440412 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_3.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c38f99da5ca9734eb8a0e8a90d8df26c2143c859 --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052446290529022 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529009 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_4.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5878096ec5b205e0c76059b3d8d6bad89209b6df --- /dev/null +++ b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050905521228584 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_5.json b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..32a94a035a5802f6bbe3b7ab08bc5d371f08579e --- /dev/null +++ 
b/4b284b17boscar/eval/slim.4b284b17boscar_winogrande_underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5209155485398579, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01404018549421294 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01404827882040562 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/merged.csv b/4b284b17boscar/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..d670c92a9f90ac0c9f983b72abc524b5a7a22e00 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.012939399202474111 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.012939399202474111 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.2227090352718732 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.2227090352718732 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.24660179707103722 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.24660179707103722 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2520595099329764 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2520595099329764 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.25325717049884455 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.25325717049884455 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2553196283460093 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2553196283460093 +e2e_nlg_cleaned,5,average,multiple,0.2071477567205358 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.03905778248153537 +gem_xsum,0,median,rouge2_fmeasure,0.03905778248153537 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04107507874444822 +gem_xsum,1,median,rouge2_fmeasure,0.04107507874444822 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05024654283296407 +gem_xsum,2,median,rouge2_fmeasure,0.05024654283296407 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05092039724374764 +gem_xsum,3,median,rouge2_fmeasure,0.05092039724374764 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.013427211505942963 +gem_xsum,4,median,rouge2_fmeasure,0.013427211505942963 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00024003297882235707 +gem_xsum,5,median,rouge2_fmeasure,0.00024003297882235707 +gem_xsum,5,average,multiple,0.03249450763124344 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.039330605607392446 +web_nlg_en,0,median,rouge2_fmeasure,0.039330605607392446 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.07545981652785559 +web_nlg_en,1,median,rouge2_fmeasure,0.07545981652785559 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.09627247599444268 +web_nlg_en,2,median,rouge2_fmeasure,0.09627247599444268 
+web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.10183935852735193 +web_nlg_en,3,median,rouge2_fmeasure,0.10183935852735193 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.10691762381917162 +web_nlg_en,4,median,rouge2_fmeasure,0.10691762381917162 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.11512570203467995 +web_nlg_en,5,median,rouge2_fmeasure,0.11512570203467995 +web_nlg_en,5,average,multiple,0.08915759708514903 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03335939346289185 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03335939346289185 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03413869729774695 +wiki_lingua_en,1,median,rouge2_fmeasure,0.03413869729774695 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06229028061393615 +wiki_lingua_en,2,median,rouge2_fmeasure,0.06229028061393615 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05709915142524781 +wiki_lingua_en,3,median,rouge2_fmeasure,0.05709915142524781 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01797607134766925 +wiki_lingua_en,4,median,rouge2_fmeasure,0.01797607134766925 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0031781048041287503 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0031781048041287503 +wiki_lingua_en,5,average,multiple,0.034673616491936794 diff --git a/4b284b17boscar/evaluation/generation/merged.json b/4b284b17boscar/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..366cb71d1c400689483cf6d8294c0bd16a6285d8 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.30064593181269955, "bleu_stderr": 0.026879704157341512, "rouge1_fmeasure": 0.09060657840339362, "rouge1_fmeasure_stderr": 0.001978890806658563, "rouge1_precision": 0.06703651854735132, "rouge1_precision_stderr": 0.0024391085403758696, "rouge1_recall": 0.26740772484207054, "rouge1_recall_stderr": 0.004912155390010422, "rouge2_fmeasure": 0.039330605607392446, "rouge2_fmeasure_stderr": 0.0011751573991520296, "rouge2_precision": 0.028179323090803912, "rouge2_precision_stderr": 0.0013431147572720058, "rouge2_recall": 0.11491018942638491, "rouge2_recall_stderr": 0.0030470199083947793, "rougeL_fmeasure": 0.08688274387361468, "rougeL_fmeasure_stderr": 0.0018578198715780445, "rougeL_precision": 0.06406482680474013, "rougeL_precision_stderr": 0.0023017838075157946, "rougeL_recall": 0.257657677037235, "rougeL_recall_stderr": 0.004746012740112394, "rougeLsum_fmeasure": 0.0854912841791626, "rougeLsum_fmeasure_stderr": 0.0018955102722507356, "rougeLsum_precision": 0.0634475056304135, "rougeLsum_precision_stderr": 0.002344312215241603, "rougeLsum_recall": 0.24940060733167396, "rougeLsum_recall_stderr": 0.004590470603510631}}, "1": {"PALM_prompt": {"bleu": 0.5448014860907993, "bleu_stderr": 0.02354902424699172, "rouge1_fmeasure": 0.15412558821549818, "rouge1_fmeasure_stderr": 0.0035554299152960674, "rouge1_precision": 0.1388827638743681, "rouge1_precision_stderr": 0.004513090850395629, "rouge1_recall": 0.30199833689345784, "rouge1_recall_stderr": 0.004833949344945251, "rouge2_fmeasure": 0.07545981652785559, "rouge2_fmeasure_stderr": 0.002357043487541018, "rouge2_precision": 0.06750783374322028, "rouge2_precision_stderr": 0.002988686287368457, "rouge2_recall": 0.14947512582685754, "rouge2_recall_stderr": 0.0033695091348626585, "rougeL_fmeasure": 0.1394146722460062, "rougeL_fmeasure_stderr": 0.002993354560745065, "rougeL_precision": 0.12450746932788907, "rougeL_precision_stderr": 0.003982256290142074, "rougeL_recall": 0.2819898497969715, "rougeL_recall_stderr": 
0.004422334130125733, "rougeLsum_fmeasure": 0.1421792554091385, "rougeLsum_fmeasure_stderr": 0.0031066004309073594, "rougeLsum_precision": 0.12750187523825526, "rougeLsum_precision_stderr": 0.004097239372173061, "rougeLsum_recall": 0.28488318300316834, "rougeLsum_recall_stderr": 0.004471716742740801}}, "2": {"PALM_prompt": {"bleu": 0.7494822896647901, "bleu_stderr": 0.045680162494902386, "rouge1_fmeasure": 0.18686493440413654, "rouge1_fmeasure_stderr": 0.004140752549886329, "rouge1_precision": 0.16651047001316224, "rouge1_precision_stderr": 0.00503164302116687, "rouge1_recall": 0.34879486947967936, "rouge1_recall_stderr": 0.00485184387176236, "rouge2_fmeasure": 0.09627247599444268, "rouge2_fmeasure_stderr": 0.0028826866977524927, "rouge2_precision": 0.08795085013666165, "rouge2_precision_stderr": 0.003446853896796009, "rouge2_recall": 0.1803167154721538, "rouge2_recall_stderr": 0.0037373578885484243, "rougeL_fmeasure": 0.16801781294867849, "rougeL_fmeasure_stderr": 0.0035458321298973234, "rougeL_precision": 0.14757163053942288, "rougeL_precision_stderr": 0.004365281542114343, "rougeL_recall": 0.32518786802891597, "rougeL_recall_stderr": 0.004501197296979544, "rougeLsum_fmeasure": 0.17158619573308823, "rougeLsum_fmeasure_stderr": 0.0036701761982919448, "rougeLsum_precision": 0.15178195561235863, "rougeLsum_precision_stderr": 0.004526255293354419, "rougeLsum_recall": 0.3286569936174265, "rougeLsum_recall_stderr": 0.004546078233004703}}, "3": {"PALM_prompt": {"bleu": 0.8754863248245969, "bleu_stderr": 0.04360822451403643, "rouge1_fmeasure": 0.19673949819066772, "rouge1_fmeasure_stderr": 0.004214599980941433, "rouge1_precision": 0.177665566463972, "rouge1_precision_stderr": 0.005213159352117334, "rouge1_recall": 0.36388817345982977, "rouge1_recall_stderr": 0.004942659870867192, "rouge2_fmeasure": 0.10183935852735193, "rouge2_fmeasure_stderr": 0.0028397205254593197, "rouge2_precision": 0.09644734799007487, "rouge2_precision_stderr": 0.003610059263232589, "rouge2_recall": 0.1890195299674566, "rouge2_recall_stderr": 0.0037165922348924205, "rougeL_fmeasure": 0.17564046576780104, "rougeL_fmeasure_stderr": 0.00357350787655052, "rougeL_precision": 0.15715897881351443, "rougeL_precision_stderr": 0.00455057510376714, "rougeL_recall": 0.3364361502529422, "rougeL_recall_stderr": 0.004479392424459559, "rougeLsum_fmeasure": 0.18055401062696452, "rougeLsum_fmeasure_stderr": 0.00373462611859697, "rougeLsum_precision": 0.16277909758999493, "rougeLsum_precision_stderr": 0.004769154955939382, "rougeLsum_recall": 0.3412001797439044, "rougeLsum_recall_stderr": 0.004532856553347463}}, "4": {"PALM_prompt": {"bleu": 0.9609553724467281, "bleu_stderr": 0.06219622144359429, "rouge1_fmeasure": 0.2035685591724469, "rouge1_fmeasure_stderr": 0.00420720523051007, "rouge1_precision": 0.18155711637430752, "rouge1_precision_stderr": 0.005153207013059504, "rouge1_recall": 0.37582358230783497, "rouge1_recall_stderr": 0.004868478990903738, "rouge2_fmeasure": 0.10691762381917162, "rouge2_fmeasure_stderr": 0.002903183299069371, "rouge2_precision": 0.09862370157443069, "rouge2_precision_stderr": 0.003550556891817434, "rouge2_recall": 0.19877038562348062, "rouge2_recall_stderr": 0.0037326666334346983, "rougeL_fmeasure": 0.1801505100456093, "rougeL_fmeasure_stderr": 0.003508323835548118, "rougeL_precision": 0.1583476022875235, "rougeL_precision_stderr": 0.00437564418742707, "rougeL_recall": 0.3452891933106154, "rougeL_recall_stderr": 0.004375446165777104, "rougeLsum_fmeasure": 0.1856568180747029, "rougeLsum_fmeasure_stderr": 
0.003673260642261377, "rougeLsum_precision": 0.16428760554962601, "rougeLsum_precision_stderr": 0.004585243637248548, "rougeLsum_recall": 0.3517722828354071, "rougeLsum_recall_stderr": 0.004454120443642119}}, "5": {"PALM_prompt": {"bleu": 1.0578470733972574, "bleu_stderr": 0.06748885712032304, "rouge1_fmeasure": 0.21188453557964756, "rouge1_fmeasure_stderr": 0.004406461443397243, "rouge1_precision": 0.19481471590160654, "rouge1_precision_stderr": 0.005470101363136162, "rouge1_recall": 0.3790270896990674, "rouge1_recall_stderr": 0.0048619650256576654, "rouge2_fmeasure": 0.11512570203467995, "rouge2_fmeasure_stderr": 0.0030866056829496383, "rouge2_precision": 0.10948046927141483, "rouge2_precision_stderr": 0.0038119634607733702, "rouge2_recall": 0.20526211579883663, "rouge2_recall_stderr": 0.0038681925059066017, "rougeL_fmeasure": 0.18813765770684918, "rougeL_fmeasure_stderr": 0.003736931314323814, "rougeL_precision": 0.17018196029834431, "rougeL_precision_stderr": 0.004683975046675936, "rougeL_recall": 0.34939009051908676, "rougeL_recall_stderr": 0.0044237864033617015, "rougeLsum_fmeasure": 0.19423676766572612, "rougeLsum_fmeasure_stderr": 0.003908165471883654, "rougeLsum_precision": 0.17731841714776353, "rougeLsum_precision_stderr": 0.004926877660252843, "rougeLsum_recall": 0.3557595779694418, "rougeLsum_recall_stderr": 0.004485010413076965}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.2473591602494687, "bleu_stderr": 0.09068336197346967, "rouge1_fmeasure": 0.14941919223655403, "rouge1_fmeasure_stderr": 0.0023626814400315678, "rouge1_precision": 0.14048666426016918, "rouge1_precision_stderr": 0.0027632901821198767, "rouge1_recall": 0.20669383392760043, "rouge1_recall_stderr": 0.003301837819297362, "rouge2_fmeasure": 0.03335939346289185, "rouge2_fmeasure_stderr": 0.0009250316805971055, "rouge2_precision": 0.029525585854477905, "rouge2_precision_stderr": 0.0008710537995147101, "rouge2_recall": 0.04703290588131188, "rouge2_recall_stderr": 0.0014209190224071058, "rougeL_fmeasure": 0.11304744711913277, "rougeL_fmeasure_stderr": 0.0017089859184516304, "rougeL_precision": 0.10735133959204357, "rougeL_precision_stderr": 0.002260925698512594, "rougeL_recall": 0.15988100122471754, "rougeL_recall_stderr": 0.0025788969320249258, "rougeLsum_fmeasure": 0.1383102708440391, "rougeLsum_fmeasure_stderr": 0.0021868611093705326, "rougeLsum_precision": 0.1306418700473533, "rougeLsum_precision_stderr": 0.002629490470671884, "rougeLsum_recall": 0.19167920541216937, "rougeLsum_recall_stderr": 0.0030766125540393364}}, "1": {"tldr_en": {"bleu": 2.161551083223167, "bleu_stderr": 0.08752015895630673, "rouge1_fmeasure": 0.1614637512336107, "rouge1_fmeasure_stderr": 0.002154838127118863, "rouge1_precision": 0.20164190917278885, "rouge1_precision_stderr": 0.00340245779140814, "rouge1_recall": 0.18321322169620108, "rouge1_recall_stderr": 0.002763046996934259, "rouge2_fmeasure": 0.03413869729774695, "rouge2_fmeasure_stderr": 0.0010840564141073451, "rouge2_precision": 0.047278865796894595, "rouge2_precision_stderr": 0.0019439681128851241, "rouge2_recall": 0.03836443905737192, "rouge2_recall_stderr": 0.001290609856100645, "rougeL_fmeasure": 0.12354041881834915, "rougeL_fmeasure_stderr": 0.001615855049003565, "rougeL_precision": 0.15753098406464713, "rougeL_precision_stderr": 0.002833744217070429, "rougeL_recall": 0.14069990485042436, "rougeL_recall_stderr": 0.0021198152025286845, "rougeLsum_fmeasure": 0.1510952282258089, "rougeLsum_fmeasure_stderr": 0.0020029762719872084, "rougeLsum_precision": 0.1895150765616292, 
"rougeLsum_precision_stderr": 0.0032439901277073056, "rougeLsum_recall": 0.17138871604556788, "rougeLsum_recall_stderr": 0.0025668579918568207}}, "2": {"tldr_en": {"bleu": 3.6712694957538092, "bleu_stderr": 0.12335434340064529, "rouge1_fmeasure": 0.22448439980932258, "rouge1_fmeasure_stderr": 0.0022164668755803446, "rouge1_precision": 0.30853356659765047, "rouge1_precision_stderr": 0.003908986083294316, "rouge1_recall": 0.23611642752367193, "rouge1_recall_stderr": 0.0028098851869033524, "rouge2_fmeasure": 0.06229028061393615, "rouge2_fmeasure_stderr": 0.0013505011306650329, "rouge2_precision": 0.09422907936670104, "rouge2_precision_stderr": 0.0025861688595862934, "rouge2_recall": 0.0641343554561749, "rouge2_recall_stderr": 0.001505883040485307, "rougeL_fmeasure": 0.17363950691564434, "rougeL_fmeasure_stderr": 0.0017583318279909019, "rougeL_precision": 0.24269847651739596, "rougeL_precision_stderr": 0.0033468376687347985, "rougeL_recall": 0.18285960289057043, "rougeL_recall_stderr": 0.0022296534964715052, "rougeLsum_fmeasure": 0.2094330802436139, "rougeLsum_fmeasure_stderr": 0.0021028796887348792, "rougeLsum_precision": 0.2890613207429283, "rougeLsum_precision_stderr": 0.0037637245904156287, "rougeLsum_recall": 0.22021341816156315, "rougeLsum_recall_stderr": 0.0026435827841586675}}, "3": {"tldr_en": {"bleu": 2.8478598408877533, "bleu_stderr": 0.07777205711444814, "rouge1_fmeasure": 0.19386388642181623, "rouge1_fmeasure_stderr": 0.0025825739138006965, "rouge1_precision": 0.28082836293366026, "rouge1_precision_stderr": 0.004319205532930181, "rouge1_recall": 0.1968690045661053, "rouge1_recall_stderr": 0.0030792534914180656, "rouge2_fmeasure": 0.05709915142524781, "rouge2_fmeasure_stderr": 0.0014322906346376607, "rouge2_precision": 0.08943278692521785, "rouge2_precision_stderr": 0.00260423039924173, "rouge2_recall": 0.057174167865699795, "rouge2_recall_stderr": 0.001596148314592036, "rougeL_fmeasure": 0.15256193731747766, "rougeL_fmeasure_stderr": 0.002064934954063214, "rougeL_precision": 0.2250992736689434, "rougeL_precision_stderr": 0.0036669609547972885, "rougeL_recall": 0.15462182132992405, "rougeL_recall_stderr": 0.002462032347060667, "rougeLsum_fmeasure": 0.18280631870790667, "rougeLsum_fmeasure_stderr": 0.002455551930143459, "rougeLsum_precision": 0.2658819512511678, "rougeLsum_precision_stderr": 0.004162301116768716, "rougeLsum_recall": 0.18550776729783153, "rougeLsum_recall_stderr": 0.0029178715298313596}}, "4": {"tldr_en": {"bleu": 0.06105217041635524, "bleu_stderr": 0.010329575504157805, "rouge1_fmeasure": 0.06231197551494105, "rouge1_fmeasure_stderr": 0.002187598780650555, "rouge1_precision": 0.09422577510372415, "rouge1_precision_stderr": 0.0035020894909113, "rouge1_recall": 0.06252755874219122, "rouge1_recall_stderr": 0.0023819303265762287, "rouge2_fmeasure": 0.01797607134766925, "rouge2_fmeasure_stderr": 0.0009610791037997163, "rouge2_precision": 0.02901231903773154, "rouge2_precision_stderr": 0.0017246417352838742, "rouge2_recall": 0.01773347814345909, "rouge2_recall_stderr": 0.001029862057905482, "rougeL_fmeasure": 0.0496325144290003, "rougeL_fmeasure_stderr": 0.0017538529763009563, "rougeL_precision": 0.07680762398075403, "rougeL_precision_stderr": 0.0029646336358863275, "rougeL_recall": 0.0495501454165194, "rougeL_recall_stderr": 0.0018970706209682415, "rougeLsum_fmeasure": 0.05850840107044648, "rougeLsum_fmeasure_stderr": 0.0020558492019394463, "rougeLsum_precision": 0.08907486896592501, "rougeLsum_precision_stderr": 0.0033434435220432352, "rougeLsum_recall": 
0.05871435609309791, "rougeLsum_recall_stderr": 0.002238019081316999}}, "5": {"tldr_en": {"bleu": 6.646244004651184e-16, "bleu_stderr": 6.685084737559834e-14, "rouge1_fmeasure": 0.010082013794884772, "rouge1_fmeasure_stderr": 0.0010101938209082168, "rouge1_precision": 0.01542645344389709, "rouge1_precision_stderr": 0.0015844345606781326, "rouge1_recall": 0.009780146620517932, "rouge1_recall_stderr": 0.0010592344698244329, "rouge2_fmeasure": 0.0031781048041287503, "rouge2_fmeasure_stderr": 0.00045646146249046847, "rouge2_precision": 0.00537468824664498, "rouge2_precision_stderr": 0.0008303851566942339, "rouge2_recall": 0.0029150772505705507, "rouge2_recall_stderr": 0.00043016549798545706, "rougeL_fmeasure": 0.00807413792108403, "rougeL_fmeasure_stderr": 0.0008184301651151722, "rougeL_precision": 0.012798829405832771, "rougeL_precision_stderr": 0.0013675936853478024, "rougeL_recall": 0.007665110474443101, "rougeL_recall_stderr": 0.0008164388748390192, "rougeLsum_fmeasure": 0.009445400067942783, "rougeLsum_fmeasure_stderr": 0.0009397610137449974, "rougeLsum_precision": 0.014620295797040817, "rougeLsum_precision_stderr": 0.0015055626328950474, "rougeLsum_recall": 0.009149077393940025, "rougeLsum_recall_stderr": 0.0009886149667351712}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.23009457158327395, "bleu_stderr": 0.032672420546843176, "rouge1_fmeasure": 0.03565768533846172, "rouge1_fmeasure_stderr": 0.0015248066303431285, "rouge1_precision": 0.046037847651470024, "rouge1_precision_stderr": 0.002049984498068587, "rouge1_recall": 0.034901976047314216, "rouge1_recall_stderr": 0.0015828043496248269, "rouge2_fmeasure": 0.012939399202474111, "rouge2_fmeasure_stderr": 0.0007270998548969809, "rouge2_precision": 0.016516540815237794, "rouge2_precision_stderr": 0.0011075729579088504, "rouge2_recall": 0.012463074267663072, "rouge2_recall_stderr": 0.0007148960008422395, "rougeL_fmeasure": 0.03123228667584005, "rougeL_fmeasure_stderr": 0.0013412818998006393, "rougeL_precision": 0.040481693666315224, "rougeL_precision_stderr": 0.0018321004056548362, "rougeL_recall": 0.03073851262251656, "rougeL_recall_stderr": 0.001408552293064241, "rougeLsum_fmeasure": 0.03473298460911775, "rougeLsum_fmeasure_stderr": 0.0014863724750131808, "rougeLsum_precision": 0.0451064653533249, "rougeLsum_precision_stderr": 0.0020199102881650523, "rougeLsum_recall": 0.033817754277814645, "rougeLsum_recall_stderr": 0.0015239128770440768}}, "1": {"generate_text_restaurant": {"bleu": 12.089976671939015, "bleu_stderr": 0.168251102142501, "rouge1_fmeasure": 0.4712653059021899, "rouge1_fmeasure_stderr": 0.0023607480373166326, "rouge1_precision": 0.5731074158985243, "rouge1_precision_stderr": 0.003284340506365394, "rouge1_recall": 0.44018576325041153, "rouge1_recall_stderr": 0.0030131809832668697, "rouge2_fmeasure": 0.2227090352718732, "rouge2_fmeasure_stderr": 0.0020690282260279855, "rouge2_precision": 0.275133195308103, "rouge2_precision_stderr": 0.002740669736631083, "rouge2_recall": 0.2075323986474321, "rouge2_recall_stderr": 0.002196437540323409, "rougeL_fmeasure": 0.34042049528282725, "rougeL_fmeasure_stderr": 0.0021081402016807802, "rougeL_precision": 0.4165254470631004, "rougeL_precision_stderr": 0.0029803269219252386, "rougeL_recall": 0.3172191245436191, "rougeL_recall_stderr": 0.002462143659310004, "rougeLsum_fmeasure": 0.38335331690793345, "rougeLsum_fmeasure_stderr": 0.002363317168332221, "rougeLsum_precision": 0.4671938157159783, "rougeLsum_precision_stderr": 0.003215098649092779, "rougeLsum_recall": 
0.3576660404019271, "rougeLsum_recall_stderr": 0.002771243835460788}}, "2": {"generate_text_restaurant": {"bleu": 14.306767136731397, "bleu_stderr": 0.20748195293257823, "rouge1_fmeasure": 0.4959039342859941, "rouge1_fmeasure_stderr": 0.0022179464050245366, "rouge1_precision": 0.586146194582904, "rouge1_precision_stderr": 0.003201208783304345, "rouge1_recall": 0.46837981221534686, "rouge1_recall_stderr": 0.0028821983932268993, "rouge2_fmeasure": 0.24660179707103722, "rouge2_fmeasure_stderr": 0.0020875846431590893, "rouge2_precision": 0.29546214538365784, "rouge2_precision_stderr": 0.002760222751352884, "rouge2_recall": 0.23282858226774875, "rouge2_recall_stderr": 0.0022643379007857164, "rougeL_fmeasure": 0.3658398555812216, "rougeL_fmeasure_stderr": 0.0021228022591806505, "rougeL_precision": 0.43453348333901143, "rougeL_precision_stderr": 0.0030164445891002808, "rougeL_recall": 0.34493209177229056, "rougeL_recall_stderr": 0.0024926657554512834, "rougeLsum_fmeasure": 0.41353486620349267, "rougeLsum_fmeasure_stderr": 0.0023298219846166635, "rougeLsum_precision": 0.48901167662589756, "rougeLsum_precision_stderr": 0.003188380289601738, "rougeLsum_recall": 0.39060971772253245, "rougeLsum_recall_stderr": 0.00277158241175323}}, "3": {"generate_text_restaurant": {"bleu": 14.986652923553951, "bleu_stderr": 0.14924649133846585, "rouge1_fmeasure": 0.49941178939651276, "rouge1_fmeasure_stderr": 0.002215810900097095, "rouge1_precision": 0.5852147290185015, "rouge1_precision_stderr": 0.003198801112514953, "rouge1_recall": 0.47309130989416415, "rouge1_recall_stderr": 0.002863282453776492, "rouge2_fmeasure": 0.2520595099329764, "rouge2_fmeasure_stderr": 0.002137559032929213, "rouge2_precision": 0.2992462347854545, "rouge2_precision_stderr": 0.00279234169219994, "rouge2_recall": 0.23892912708756728, "rouge2_recall_stderr": 0.002326244957735514, "rougeL_fmeasure": 0.37080740120712563, "rougeL_fmeasure_stderr": 0.0021085500680046655, "rougeL_precision": 0.4362259991876925, "rougeL_precision_stderr": 0.002981357963715629, "rougeL_recall": 0.3508817024714458, "rougeL_recall_stderr": 0.002481255460162326, "rougeLsum_fmeasure": 0.42024778988515976, "rougeLsum_fmeasure_stderr": 0.0023378975699189846, "rougeLsum_precision": 0.49286818963774864, "rougeLsum_precision_stderr": 0.0032037762403395406, "rougeLsum_recall": 0.398038444692037, "rougeLsum_recall_stderr": 0.0027747754778932373}}, "4": {"generate_text_restaurant": {"bleu": 15.355039272215512, "bleu_stderr": 0.1397449379312914, "rouge1_fmeasure": 0.5006142198930905, "rouge1_fmeasure_stderr": 0.002241219932072369, "rouge1_precision": 0.5790492697063228, "rouge1_precision_stderr": 0.003163418243454001, "rouge1_recall": 0.47626870304120433, "rouge1_recall_stderr": 0.0028295282078555388, "rouge2_fmeasure": 0.25325717049884455, "rouge2_fmeasure_stderr": 0.00212780022639949, "rouge2_precision": 0.2964235601953411, "rouge2_precision_stderr": 0.0027302282806629004, "rouge2_recall": 0.24084946363934556, "rouge2_recall_stderr": 0.002279965123787374, "rougeL_fmeasure": 0.37278812205664297, "rougeL_fmeasure_stderr": 0.002143459855710583, "rougeL_precision": 0.43244548404877137, "rougeL_precision_stderr": 0.0029330079057210594, "rougeL_recall": 0.35437374939057237, "rougeL_recall_stderr": 0.002483948940307853, "rougeLsum_fmeasure": 0.42378128466807036, "rougeLsum_fmeasure_stderr": 0.0023597511181794963, "rougeLsum_precision": 0.4902546597328058, "rougeLsum_precision_stderr": 0.003150242604642586, "rougeLsum_recall": 0.4030653858271246, "rougeLsum_recall_stderr": 
0.002753130148341471}}, "5": {"generate_text_restaurant": {"bleu": 15.51316933790889, "bleu_stderr": 0.14131860296901558, "rouge1_fmeasure": 0.5033945465560786, "rouge1_fmeasure_stderr": 0.002201297123291935, "rouge1_precision": 0.578638440566317, "rouge1_precision_stderr": 0.0031479626596216882, "rouge1_recall": 0.47971897395973623, "rouge1_recall_stderr": 0.0027570588512589957, "rouge2_fmeasure": 0.2553196283460093, "rouge2_fmeasure_stderr": 0.002122343708414921, "rouge2_precision": 0.29709476038507515, "rouge2_precision_stderr": 0.0027367014589648156, "rouge2_recall": 0.2428822828451652, "rouge2_recall_stderr": 0.002252237417006984, "rougeL_fmeasure": 0.3734887555835707, "rougeL_fmeasure_stderr": 0.002105068605546113, "rougeL_precision": 0.4303055719591573, "rougeL_precision_stderr": 0.0028832325276234383, "rougeL_recall": 0.3558059120429404, "rougeL_recall_stderr": 0.0024373374637128653, "rougeLsum_fmeasure": 0.4244461237028647, "rougeLsum_fmeasure_stderr": 0.0023223935862545925, "rougeLsum_precision": 0.48802659063198645, "rougeLsum_precision_stderr": 0.0031316303520488075, "rougeLsum_recall": 0.4043415445372068, "rougeLsum_recall_stderr": 0.002685631053632136}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.6078680919808515, "bleu_stderr": 0.13305526786747834, "rouge1_fmeasure": 0.18544940835038826, "rouge1_fmeasure_stderr": 0.002792598389230579, "rouge1_precision": 0.1357849290108061, "rouge1_precision_stderr": 0.0022085026440544792, "rouge1_recall": 0.31125110203110956, "rouge1_recall_stderr": 0.00478150841719741, "rouge2_fmeasure": 0.03905778248153537, "rouge2_fmeasure_stderr": 0.0014517464622913717, "rouge2_precision": 0.02806264964322551, "rouge2_precision_stderr": 0.0010490970791455262, "rouge2_recall": 0.0676376603721838, "rouge2_recall_stderr": 0.002556389169521471, "rougeL_fmeasure": 0.1355227325800368, "rougeL_fmeasure_stderr": 0.0019961537320126754, "rougeL_precision": 0.09941478269511335, "rougeL_precision_stderr": 0.0016710762156098235, "rougeL_recall": 0.22831881338494908, "rougeL_recall_stderr": 0.0035475652964304587, "rougeLsum_fmeasure": 0.1487940648277913, "rougeLsum_fmeasure_stderr": 0.0022923438101904283, "rougeLsum_precision": 0.1090062194949961, "rougeLsum_precision_stderr": 0.001862164401400258, "rougeLsum_recall": 0.2506136306420929, "rougeLsum_recall_stderr": 0.004002812528613584}}, "1": {"article_DOC_summary": {"bleu": 1.7426173208536198, "bleu_stderr": 0.06390989391271595, "rouge1_fmeasure": 0.19812664092391155, "rouge1_fmeasure_stderr": 0.003093672497471887, "rouge1_precision": 0.18487966767987538, "rouge1_precision_stderr": 0.0036696902142416914, "rouge1_recall": 0.26149928474217465, "rouge1_recall_stderr": 0.004228571030201456, "rouge2_fmeasure": 0.04107507874444822, "rouge2_fmeasure_stderr": 0.001790944203843995, "rouge2_precision": 0.038450464485312766, "rouge2_precision_stderr": 0.0018721013867502832, "rouge2_recall": 0.055093479727191086, "rouge2_recall_stderr": 0.0024314883171235226, "rougeL_fmeasure": 0.15109086002249855, "rougeL_fmeasure_stderr": 0.0024037820360525127, "rougeL_precision": 0.1407652383762195, "rougeL_precision_stderr": 0.002867258911637599, "rougeL_recall": 0.20074896873173023, "rougeL_recall_stderr": 0.0033400948802297637, "rougeLsum_fmeasure": 0.15400915124743492, "rougeLsum_fmeasure_stderr": 0.0025002059384586265, "rougeLsum_precision": 0.14290226133870376, "rougeLsum_precision_stderr": 0.002899542963069592, "rougeLsum_recall": 0.20559699548522795, "rougeLsum_recall_stderr": 0.0035828941817263715}}, "2": 
{"article_DOC_summary": {"bleu": 2.212042173147054, "bleu_stderr": 0.14754927518486374, "rouge1_fmeasure": 0.21646360740135792, "rouge1_fmeasure_stderr": 0.003295332693289183, "rouge1_precision": 0.20654064116685483, "rouge1_precision_stderr": 0.00402670019732593, "rouge1_recall": 0.2761636603625548, "rouge1_recall_stderr": 0.004261678708829253, "rouge2_fmeasure": 0.05024654283296407, "rouge2_fmeasure_stderr": 0.0020978860757402073, "rouge2_precision": 0.04926519014044061, "rouge2_precision_stderr": 0.002298517760897842, "rouge2_recall": 0.06300838426900698, "rouge2_recall_stderr": 0.0025900885284329783, "rougeL_fmeasure": 0.16496562440527163, "rougeL_fmeasure_stderr": 0.0026851454383202047, "rougeL_precision": 0.15783804831230816, "rougeL_precision_stderr": 0.0033092482048118115, "rougeL_recall": 0.21095994736677962, "rougeL_recall_stderr": 0.003416543139413067, "rougeLsum_fmeasure": 0.16854842502615797, "rougeLsum_fmeasure_stderr": 0.0027533826606543164, "rougeLsum_precision": 0.16058710601356815, "rougeLsum_precision_stderr": 0.0033312368166406067, "rougeLsum_recall": 0.21694184867744362, "rougeLsum_recall_stderr": 0.003641660551599739}}, "3": {"article_DOC_summary": {"bleu": 2.292440773010242, "bleu_stderr": 0.12688961626559822, "rouge1_fmeasure": 0.21602695443039088, "rouge1_fmeasure_stderr": 0.003545745499002209, "rouge1_precision": 0.20968934404830927, "rouge1_precision_stderr": 0.004355318865286731, "rouge1_recall": 0.2714957979387898, "rouge1_recall_stderr": 0.004428046228223116, "rouge2_fmeasure": 0.05092039724374764, "rouge2_fmeasure_stderr": 0.0021028017555591145, "rouge2_precision": 0.051495808432128344, "rouge2_precision_stderr": 0.002483343433806453, "rouge2_recall": 0.0626272643756316, "rouge2_recall_stderr": 0.0025430698963779036, "rougeL_fmeasure": 0.16465282938122464, "rougeL_fmeasure_stderr": 0.002817348856079681, "rougeL_precision": 0.16001543314957897, "rougeL_precision_stderr": 0.003518230552839032, "rougeL_recall": 0.20782119995033888, "rougeL_recall_stderr": 0.0035214877778981, "rougeLsum_fmeasure": 0.16805945710868087, "rougeLsum_fmeasure_stderr": 0.0028771833437224903, "rougeLsum_precision": 0.16265225378046935, "rougeLsum_precision_stderr": 0.003535398294040215, "rougeLsum_recall": 0.21338945027387293, "rougeLsum_recall_stderr": 0.0037313572424076884}}, "4": {"article_DOC_summary": {"bleu": 0.34928748143158433, "bleu_stderr": 0.06968377794297799, "rouge1_fmeasure": 0.05844521610204345, "rouge1_fmeasure_stderr": 0.003357755822076642, "rouge1_precision": 0.066317893890864, "rouge1_precision_stderr": 0.00423472373517398, "rouge1_recall": 0.0668307970074077, "rouge1_recall_stderr": 0.0038903454010429882, "rouge2_fmeasure": 0.013427211505942963, "rouge2_fmeasure_stderr": 0.0013325070824321361, "rouge2_precision": 0.01516582795790956, "rouge2_precision_stderr": 0.0016864721571447335, "rouge2_recall": 0.015164369380224488, "rouge2_recall_stderr": 0.0014785207392795535, "rougeL_fmeasure": 0.04435132297085604, "rougeL_fmeasure_stderr": 0.0026237858818432148, "rougeL_precision": 0.050958054485125935, "rougeL_precision_stderr": 0.0034077121993672244, "rougeL_recall": 0.05078596392429664, "rougeL_recall_stderr": 0.0030413833239327495, "rougeLsum_fmeasure": 0.04569745564992877, "rougeLsum_fmeasure_stderr": 0.00269219477160947, "rougeLsum_precision": 0.05197463436429066, "rougeLsum_precision_stderr": 0.003434452220019136, "rougeLsum_recall": 0.052735906009332555, "rougeLsum_recall_stderr": 0.003177823933607375}}, "5": {"article_DOC_summary": {"bleu": 1.845610532495598e-43, 
"bleu_stderr": 6.163279325803633e-37, "rouge1_fmeasure": 0.0021633187762348204, "rouge1_fmeasure_stderr": 0.0006160758089028403, "rouge1_precision": 0.002581036492851647, "rouge1_precision_stderr": 0.0007454396977849946, "rouge1_recall": 0.0019439835173572402, "rouge1_recall_stderr": 0.0005504845967048701, "rouge2_fmeasure": 0.00024003297882235707, "rouge2_fmeasure_stderr": 0.00010755582565714237, "rouge2_precision": 0.00030731846769582613, "rouge2_precision_stderr": 0.0001384434719538918, "rouge2_recall": 0.00020070076673850258, "rouge2_recall_stderr": 9.047229228571321e-05, "rougeL_fmeasure": 0.0017320310946117893, "rougeL_fmeasure_stderr": 0.0004885245020078372, "rougeL_precision": 0.0020398313639409388, "rougeL_precision_stderr": 0.000580520638938891, "rougeL_recall": 0.0015747914931411474, "rougeL_recall_stderr": 0.0004430161114171138, "rougeLsum_fmeasure": 0.001876404823891179, "rougeLsum_fmeasure_stderr": 0.0005270763478001978, "rougeLsum_precision": 0.002194331414389935, "rougeLsum_precision_stderr": 0.0006175664744399804, "rougeLsum_recall": 0.0017109237047450568, "rougeLsum_recall_stderr": 0.0004811354022989026}}}} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8a665c30c9fd2e4a6128ecbc47fe766c6c9843 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.30064593181269955, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.026879704157341512 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.06703651854735132, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0024391085403758696 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.26740772484207054, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004912155390010422 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.09060657840339362, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001978890806658563 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.028179323090803912, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0013431147572720058 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.11491018942638491, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0030470199083947793 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.039330605607392446, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011751573991520296 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06406482680474013, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0023017838075157946 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.257657677037235, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004746012740112394 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.08688274387361468, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018578198715780445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0634475056304135, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002344312215241603 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.24940060733167396, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004590470603510631 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.0854912841791626, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018955102722507356 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..35a47678e85bff979efc3619c5fb96fd5513b60e --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5448014860907993, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02354902424699172 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.1388827638743681, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004513090850395629 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.30199833689345784, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004833949344945251 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.15412558821549818, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0035554299152960674 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.06750783374322028, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002988686287368457 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.14947512582685754, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 
0.0033695091348626585 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.07545981652785559, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002357043487541018 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.12450746932788907, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003982256290142074 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.2819898497969715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004422334130125733 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1394146722460062, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002993354560745065 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.12750187523825526, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004097239372173061 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.28488318300316834, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004471716742740801 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1421792554091385, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0031066004309073594 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..855163d72a1183db34adfa337464c6d0cf6b5a2b --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7494822896647901, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.045680162494902386 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.16651047001316224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00503164302116687 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.34879486947967936, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00485184387176236 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.18686493440413654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rouge1_fmeasure_stderr": 0.004140752549886329 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.08795085013666165, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003446853896796009 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1803167154721538, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037373578885484243 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.09627247599444268, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0028826866977524927 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.14757163053942288, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004365281542114343 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.32518786802891597, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004501197296979544 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.16801781294867849, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0035458321298973234 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.15178195561235863, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004526255293354419 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3286569936174265, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004546078233004703 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.17158619573308823, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0036701761982919448 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..03adbcc43c8ddc9ee1e75e6c9a11b924423041f6 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.8754863248245969, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04360822451403643 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.177665566463972, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005213159352117334 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.36388817345982977, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004942659870867192 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.19673949819066772, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004214599980941433 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.09644734799007487, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003610059263232589 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1890195299674566, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037165922348924205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.10183935852735193, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0028397205254593197 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.15715897881351443, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00455057510376714 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3364361502529422, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004479392424459559 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.17564046576780104, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00357350787655052 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.16277909758999493, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004769154955939382 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3412001797439044, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004532856553347463 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.18055401062696452, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00373462611859697 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..9c67b1909a050c9897da4d951f8efc8e9f223911 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.9609553724467281, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06219622144359429 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.18155711637430752, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005153207013059504 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.37582358230783497, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004868478990903738 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.2035685591724469, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00420720523051007 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.09862370157443069, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003550556891817434 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.19877038562348062, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037326666334346983 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.10691762381917162, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002903183299069371 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.1583476022875235, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00437564418742707 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3452891933106154, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004375446165777104 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1801505100456093, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003508323835548118 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.16428760554962601, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004585243637248548 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3517722828354071, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004454120443642119 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1856568180747029, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003673260642261377 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6b154f667bd1d12895b8bcbe5e6c2fb069ab1bf6 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 1.0578470733972574, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06748885712032304 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.19481471590160654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005470101363136162 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3790270896990674, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0048619650256576654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.21188453557964756, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004406461443397243 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.10948046927141483, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0038119634607733702 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.20526211579883663, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0038681925059066017 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.11512570203467995, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0030866056829496383 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.17018196029834431, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004683975046675936 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.34939009051908676, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044237864033617015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.18813765770684918, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003736931314323814 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.17731841714776353, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004926877660252843 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3557595779694418, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004485010413076965 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.19423676766572612, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003908165471883654 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..69cbe498b0ed9feff0ae3df772493fa0807f50f1 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.14048666426016918, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027632901821198767 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.20669383392760043, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003301837819297362 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.14941919223655403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023626814400315678 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.029525585854477905, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008710537995147101 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.04703290588131188, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014209190224071058 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03335939346289185, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009250316805971055 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.10735133959204357, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002260925698512594 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.15988100122471754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025788969320249258 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.11304744711913277, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017089859184516304 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1306418700473533, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002629490470671884 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.19167920541216937, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0030766125540393364 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1383102708440391, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021868611093705326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.2473591602494687, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09068336197346967 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..92937dd9bfd02fa46ca67b4cd1909042e542b46a --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.20164190917278885, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00340245779140814 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.18321322169620108, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002763046996934259 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1614637512336107, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002154838127118863 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.047278865796894595, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0019439681128851241 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.03836443905737192, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001290609856100645 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03413869729774695, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010840564141073451 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"tldr_en", + "rougeL_precision": 0.15753098406464713, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002833744217070429 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.14069990485042436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021198152025286845 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.12354041881834915, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001615855049003565 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1895150765616292, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0032439901277073056 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.17138871604556788, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025668579918568207 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1510952282258089, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020029762719872084 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.161551083223167, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08752015895630673 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c559f1f1940d611ccf212b5a6e1538f494294d36 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.30853356659765047, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003908986083294316 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.23611642752367193, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028098851869033524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.22448439980932258, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022164668755803446 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.09422907936670104, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge2_precision_stderr": 0.0025861688595862934 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0641343554561749, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001505883040485307 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06229028061393615, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013505011306650329 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.24269847651739596, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0033468376687347985 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.18285960289057043, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022296534964715052 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.17363950691564434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017583318279909019 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2890613207429283, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0037637245904156287 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.22021341816156315, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026435827841586675 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.2094330802436139, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021028796887348792 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.6712694957538092, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.12335434340064529 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2893198f46dfb03ef5f6e83b036a50391b0d748d --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.28082836293366026, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004319205532930181 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 
0.1968690045661053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0030792534914180656 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.19386388642181623, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0025825739138006965 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.08943278692521785, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00260423039924173 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.057174167865699795, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001596148314592036 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05709915142524781, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014322906346376607 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.2250992736689434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0036669609547972885 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.15462182132992405, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002462032347060667 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15256193731747766, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002064934954063214 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2658819512511678, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004162301116768716 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.18550776729783153, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0029178715298313596 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18280631870790667, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002455551930143459 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.8478598408877533, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07777205711444814 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 
100644 index 0000000000000000000000000000000000000000..ba5d3826c3ef95b2a72e1e243e56d7050d6df021 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.09422577510372415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0035020894909113 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.06252755874219122, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0023819303265762287 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06231197551494105, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002187598780650555 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.02901231903773154, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0017246417352838742 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.01773347814345909, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001029862057905482 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.01797607134766925, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009610791037997163 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07680762398075403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0029646336358863275 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.0495501454165194, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018970706209682415 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0496325144290003, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017538529763009563 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.08907486896592501, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0033434435220432352 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.05871435609309791, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002238019081316999 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05850840107044648, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020558492019394463 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.06105217041635524, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.010329575504157805 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3f13ca1635c2f06e6debdbc93df30cc85be1fa71 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.01542645344389709, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015844345606781326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.009780146620517932, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0010592344698244329 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.010082013794884772, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010101938209082168 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.00537468824664498, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008303851566942339 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0029150772505705507, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00043016549798545706 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0031781048041287503, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00045646146249046847 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.012798829405832771, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013675936853478024 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.007665110474443101, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0008164388748390192 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.00807413792108403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008184301651151722 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.014620295797040817, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015055626328950474 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.009149077393940025, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rougeLsum_recall_stderr": 0.0009886149667351712 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.009445400067942783, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009397610137449974 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 6.646244004651184e-16, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 6.685084737559834e-14 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0d12f8054ae0d4485b0a351760f52a97cd0035ee --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.23009457158327395, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.032672420546843176 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.046037847651470024, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002049984498068587 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.034901976047314216, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0015828043496248269 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.03565768533846172, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0015248066303431285 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.016516540815237794, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011075729579088504 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.012463074267663072, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0007148960008422395 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.012939399202474111, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0007270998548969809 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.040481693666315224, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": 
null, + "rougeL_precision_stderr": 0.0018321004056548362 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.03073851262251656, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.001408552293064241 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.03123228667584005, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013412818998006393 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.0451064653533249, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0020199102881650523 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.033817754277814645, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0015239128770440768 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.03473298460911775, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014863724750131808 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..785ae7e85f17bfcfb8b75188824f6e4c32020eeb --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 12.089976671939015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.168251102142501 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5731074158985243, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003284340506365394 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.44018576325041153, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0030131809832668697 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4712653059021899, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023607480373166326 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.275133195308103, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002740669736631083 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2075323986474321, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002196437540323409 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2227090352718732, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020690282260279855 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4165254470631004, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029803269219252386 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3172191245436191, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002462143659310004 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.34042049528282725, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021081402016807802 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4671938157159783, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003215098649092779 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3576660404019271, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002771243835460788 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.38335331690793345, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002363317168332221 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f1f74811138381ef7983d00688e56cfe407bbd93 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.306767136731397, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20748195293257823 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.586146194582904, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003201208783304345 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.46837981221534686, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028821983932268993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4959039342859941, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022179464050245366 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.29546214538365784, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002760222751352884 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23282858226774875, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022643379007857164 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24660179707103722, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020875846431590893 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43453348333901143, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030164445891002808 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34493209177229056, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024926657554512834 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3658398555812216, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021228022591806505 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.48901167662589756, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003188380289601738 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.39060971772253245, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00277158241175323 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.41353486620349267, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023298219846166635 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..23fc8123c6029d9f04938afa34157b5b0506f346 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.986652923553951, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14924649133846585 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5852147290185015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003198801112514953 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47309130989416415, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002863282453776492 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.49941178939651276, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002215810900097095 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2992462347854545, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00279234169219994 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23892912708756728, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002326244957735514 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2520595099329764, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002137559032929213 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4362259991876925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002981357963715629 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3508817024714458, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002481255460162326 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.37080740120712563, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021085500680046655 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.49286818963774864, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032037762403395406 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.398038444692037, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027747754778932373 + 
}, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.42024778988515976, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023378975699189846 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c9c2d1207971b2fa91301343fcae5e05884eedd9 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.355039272215512, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1397449379312914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5790492697063228, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003163418243454001 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47626870304120433, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028295282078555388 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5006142198930905, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002241219932072369 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2964235601953411, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027302282806629004 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24084946363934556, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002279965123787374 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.25325717049884455, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00212780022639949 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43244548404877137, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029330079057210594 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35437374939057237, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 
0.002483948940307853 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.37278812205664297, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002143459855710583 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4902546597328058, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003150242604642586 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4030653858271246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002753130148341471 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.42378128466807036, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023597511181794963 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..853ea97e02e37bd046b56201e08ee68f176fa0ec --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.51316933790889, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14131860296901558 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.578638440566317, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031479626596216882 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47971897395973623, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027570588512589957 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5033945465560786, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002201297123291935 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.29709476038507515, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027367014589648156 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2428822828451652, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge2_recall_stderr": 0.002252237417006984 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2553196283460093, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002122343708414921 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4303055719591573, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028832325276234383 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3558059120429404, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024373374637128653 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3734887555835707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002105068605546113 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.48802659063198645, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031316303520488075 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4043415445372068, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002685631053632136 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4244461237028647, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023223935862545925 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_0.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..03612b9dafb7e6a408403a1645b2566fb5ef50d0 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1357849290108061, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0022085026440544792 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.31125110203110956, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00478150841719741 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.18544940835038826, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002792598389230579 + }, + { 
+ "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.02806264964322551, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010490970791455262 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0676376603721838, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002556389169521471 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.03905778248153537, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014517464622913717 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.09941478269511335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016710762156098235 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.22831881338494908, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035475652964304587 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1355227325800368, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019961537320126754 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1090062194949961, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001862164401400258 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2506136306420929, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004002812528613584 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1487940648277913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022923438101904283 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.6078680919808515, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13305526786747834 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_1.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aecd17207cef6e034ed14c66154276dfba6d6337 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.18487966767987538, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 
0.0036696902142416914 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.26149928474217465, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004228571030201456 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19812664092391155, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003093672497471887 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.038450464485312766, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0018721013867502832 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.055093479727191086, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024314883171235226 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04107507874444822, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001790944203843995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1407652383762195, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002867258911637599 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.20074896873173023, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033400948802297637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15109086002249855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0024037820360525127 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.14290226133870376, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002899542963069592 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.20559699548522795, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035828941817263715 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15400915124743492, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0025002059384586265 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7426173208536198, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06390989391271595 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_2.json 
b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6ab1e8e5a07d2f9597474e444bb2cff0b13d8155 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.20654064116685483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00402670019732593 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2761636603625548, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004261678708829253 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21646360740135792, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003295332693289183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.04926519014044061, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002298517760897842 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06300838426900698, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025900885284329783 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05024654283296407, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0020978860757402073 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.15783804831230816, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0033092482048118115 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.21095994736677962, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003416543139413067 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16496562440527163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0026851454383202047 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.16058710601356815, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0033312368166406067 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.21694184867744362, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003641660551599739 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16854842502615797, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0027533826606543164 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.212042173147054, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.14754927518486374 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_3.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5a27fdc75970dd0e7eb06f086272da4dbcb43644 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.20968934404830927, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004355318865286731 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2714957979387898, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004428046228223116 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21602695443039088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003545745499002209 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.051495808432128344, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002483343433806453 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0626272643756316, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025430698963779036 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05092039724374764, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0021028017555591145 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.16001543314957897, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003518230552839032 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.20782119995033888, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035214877778981 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16465282938122464, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002817348856079681 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.16265225378046935, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003535398294040215 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.21338945027387293, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037313572424076884 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16805945710868087, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0028771833437224903 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.292440773010242, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12688961626559822 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_4.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9ebb1177c5104032ae4ec3d6068af56e8880b3cb --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.066317893890864, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00423472373517398 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0668307970074077, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0038903454010429882 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05844521610204345, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003357755822076642 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.01516582795790956, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016864721571447335 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.015164369380224488, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014785207392795535 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.013427211505942963, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013325070824321361 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.050958054485125935, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0034077121993672244 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.05078596392429664, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030413833239327495 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04435132297085604, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0026237858818432148 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.05197463436429066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003434452220019136 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.052735906009332555, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003177823933607375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04569745564992877, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00269219477160947 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.34928748143158433, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06968377794297799 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_5.json b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9b99e5dc8bb75a6e73ab9cddcbb83141868101a7 --- /dev/null +++ b/4b284b17boscar/evaluation/generation/slim.4b284b17boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002581036492851647, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007454396977849946 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0019439835173572402, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005504845967048701 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0021633187762348204, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006160758089028403 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.00030731846769582613, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0001384434719538918 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00020070076673850258, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 9.047229228571321e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.00024003297882235707, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00010755582565714237 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0020398313639409388, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeL_precision_stderr": 0.000580520638938891 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0015747914931411474, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0004430161114171138 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0017320310946117893, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0004885245020078372 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.002194331414389935, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0006175664744399804 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0017109237047450568, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004811354022989026 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.001876404823891179, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005270763478001978 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.845610532495598e-43, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 6.163279325803633e-37 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b17boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/rankeval/4b284b17boscar_0.json b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2f39224e6db86a2afe924d3f208e24a3acc99d98 --- /dev/null +++ b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.348, + "acc_stderr": 0.015070604603768408 + }, + "anli_r2": { + "acc": 0.337, + "acc_stderr": 0.014955087918653607 + }, + "anli_r3": { + "acc": 0.3408333333333333, + "acc_stderr": 0.013688600793296937 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.22058422058422059 + }, + "copa": { + "acc": 0.74, + "acc_stderr": 0.04408440022768077 + }, + "hellaswag": { + "acc": 0.4071898028281219, + "acc_stderr": 0.00490306663976195, + "acc_norm": 0.5113523202549293, + "acc_norm_stderr": 0.004988495127747283 + }, + "rte": { + "acc": 0.5523465703971119, + "acc_stderr": 0.029931070362939526 + }, + "winogrande": { + "acc": 0.5335438042620363, + "acc_stderr": 0.014020826677598096 + }, + "storycloze_2016": { + "acc": 0.6771779796900054, + "acc_stderr": 0.010812153082758843 + }, + "boolq": { + "acc": 0.5137614678899083, + "acc_stderr": 0.008741742106878659 + }, + "arc_easy": { + "acc": 0.5538720538720538, + "acc_stderr": 0.010200057828765008, + "acc_norm": 0.5033670033670034, + "acc_norm_stderr": 0.01025955089379893 + }, + "arc_challenge": { + "acc": 0.2380546075085324, + "acc_stderr": 0.012445770028026208, + "acc_norm": 0.2764505119453925, 
+ "acc_norm_stderr": 0.013069662474252427 + }, + "sciq": { + "acc": 0.825, + "acc_stderr": 0.012021627157731968, + "acc_norm": 0.757, + "acc_norm_stderr": 0.013569640199177457 + }, + "piqa": { + "acc": 0.7230685527747551, + "acc_stderr": 0.010440499969334526, + "acc_norm": 0.7230685527747551, + "acc_norm_stderr": 0.010440499969334542 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/rankeval/4b284b17boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..2f39224e6db86a2afe924d3f208e24a3acc99d98 --- /dev/null +++ b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.348, + "acc_stderr": 0.015070604603768408 + }, + "anli_r2": { + "acc": 0.337, + "acc_stderr": 0.014955087918653607 + }, + "anli_r3": { + "acc": 0.3408333333333333, + "acc_stderr": 0.013688600793296937 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.22058422058422059 + }, + "copa": { + "acc": 0.74, + "acc_stderr": 0.04408440022768077 + }, + "hellaswag": { + "acc": 0.4071898028281219, + "acc_stderr": 0.00490306663976195, + "acc_norm": 0.5113523202549293, + "acc_norm_stderr": 0.004988495127747283 + }, + "rte": { + "acc": 0.5523465703971119, + "acc_stderr": 0.029931070362939526 + }, + "winogrande": { + "acc": 0.5335438042620363, + "acc_stderr": 0.014020826677598096 + }, + "storycloze_2016": { + "acc": 0.6771779796900054, + "acc_stderr": 0.010812153082758843 + }, + "boolq": { + "acc": 0.5137614678899083, + "acc_stderr": 0.008741742106878659 + }, + "arc_easy": { + "acc": 0.5538720538720538, + "acc_stderr": 0.010200057828765008, + "acc_norm": 0.5033670033670034, + "acc_norm_stderr": 0.01025955089379893 + }, + "arc_challenge": { + "acc": 0.2380546075085324, + "acc_stderr": 0.012445770028026208, + "acc_norm": 0.2764505119453925, + "acc_norm_stderr": 0.013069662474252427 + }, + "sciq": { + "acc": 0.825, + "acc_stderr": 0.012021627157731968, + "acc_norm": 0.757, + "acc_norm_stderr": 0.013569640199177457 + }, + "piqa": { + "acc": 0.7230685527747551, + "acc_stderr": 0.010440499969334526, + "acc_norm": 0.7230685527747551, + "acc_norm_stderr": 0.010440499969334542 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/rankeval/4b284b17boscar_1.json b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f1271b27daa73bd9d04bc2b72c3a8ee83da9ec5d --- /dev/null +++ b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.356, + "acc_stderr": 0.015149042659306625 + }, + "anli_r2": { + "acc": 0.34, + "acc_stderr": 0.014987482264363937 + }, + "anli_r3": { + "acc": 0.3416666666666667, + "acc_stderr": 0.013696658778002514 + }, 
+ "cb": { + "acc": 0.32142857142857145, + "acc_stderr": 0.06297362289056341, + "f1": 0.24382716049382716 + }, + "copa": { + "acc": 0.76, + "acc_stderr": 0.04292346959909282 + }, + "hellaswag": { + "acc": 0.40380402310296754, + "acc_stderr": 0.004896563126116815, + "acc_norm": 0.522903804023103, + "acc_norm_stderr": 0.004984543540932338 + }, + "rte": { + "acc": 0.555956678700361, + "acc_stderr": 0.029907396333795994 + }, + "winogrande": { + "acc": 0.5469613259668509, + "acc_stderr": 0.0139903666321481 + }, + "storycloze_2016": { + "acc": 0.6691608765366115, + "acc_stderr": 0.010880601338204657 + }, + "boolq": { + "acc": 0.5321100917431193, + "acc_stderr": 0.008727003026917804 + }, + "arc_easy": { + "acc": 0.5841750841750841, + "acc_stderr": 0.010113348244647866, + "acc_norm": 0.5778619528619529, + "acc_norm_stderr": 0.010134620524592268 + }, + "arc_challenge": { + "acc": 0.26109215017064846, + "acc_stderr": 0.01283552390947384, + "acc_norm": 0.30887372013651876, + "acc_norm_stderr": 0.013501770929344003 + }, + "sciq": { + "acc": 0.887, + "acc_stderr": 0.010016552866696844, + "acc_norm": 0.884, + "acc_norm_stderr": 0.010131468138756993 + }, + "piqa": { + "acc": 0.7236126224156693, + "acc_stderr": 0.010434162388275619, + "acc_norm": 0.7279651795429815, + "acc_norm_stderr": 0.01038276378624739 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/rankeval/4b284b17boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..f1271b27daa73bd9d04bc2b72c3a8ee83da9ec5d --- /dev/null +++ b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.356, + "acc_stderr": 0.015149042659306625 + }, + "anli_r2": { + "acc": 0.34, + "acc_stderr": 0.014987482264363937 + }, + "anli_r3": { + "acc": 0.3416666666666667, + "acc_stderr": 0.013696658778002514 + }, + "cb": { + "acc": 0.32142857142857145, + "acc_stderr": 0.06297362289056341, + "f1": 0.24382716049382716 + }, + "copa": { + "acc": 0.76, + "acc_stderr": 0.04292346959909282 + }, + "hellaswag": { + "acc": 0.40380402310296754, + "acc_stderr": 0.004896563126116815, + "acc_norm": 0.522903804023103, + "acc_norm_stderr": 0.004984543540932338 + }, + "rte": { + "acc": 0.555956678700361, + "acc_stderr": 0.029907396333795994 + }, + "winogrande": { + "acc": 0.5469613259668509, + "acc_stderr": 0.0139903666321481 + }, + "storycloze_2016": { + "acc": 0.6691608765366115, + "acc_stderr": 0.010880601338204657 + }, + "boolq": { + "acc": 0.5321100917431193, + "acc_stderr": 0.008727003026917804 + }, + "arc_easy": { + "acc": 0.5841750841750841, + "acc_stderr": 0.010113348244647866, + "acc_norm": 0.5778619528619529, + "acc_norm_stderr": 0.010134620524592268 + }, + "arc_challenge": { + "acc": 0.26109215017064846, + "acc_stderr": 0.01283552390947384, + "acc_norm": 0.30887372013651876, + "acc_norm_stderr": 0.013501770929344003 + }, + "sciq": { + "acc": 0.887, + "acc_stderr": 0.010016552866696844, + "acc_norm": 0.884, + "acc_norm_stderr": 0.010131468138756993 + }, + "piqa": { + "acc": 0.7236126224156693, + 
"acc_stderr": 0.010434162388275619, + "acc_norm": 0.7279651795429815, + "acc_norm_stderr": 0.01038276378624739 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/rankeval/4b284b17boscar_2.json b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_2.json new file mode 100644 index 0000000000000000000000000000000000000000..85a7b19f5a3f5bf2b52e47a05fa6602bc37e9ea1 --- /dev/null +++ b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.339, + "acc_stderr": 0.014976758771620347 + }, + "anli_r2": { + "acc": 0.335, + "acc_stderr": 0.014933117490932573 + }, + "anli_r3": { + "acc": 0.31916666666666665, + "acc_stderr": 0.013462309712005136 + }, + "cb": { + "acc": 0.32142857142857145, + "acc_stderr": 0.06297362289056341, + "f1": 0.24941724941724944 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.40748854809798846, + "acc_stderr": 0.004903628887264533, + "acc_norm": 0.5238000398327026, + "acc_norm_stderr": 0.004984125363319072 + }, + "rte": { + "acc": 0.5090252707581228, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.5493291239147593, + "acc_stderr": 0.01398392886904024 + }, + "storycloze_2016": { + "acc": 0.6622127204703367, + "acc_stderr": 0.010937034991003881 + }, + "boolq": { + "acc": 0.5318042813455658, + "acc_stderr": 0.008727345583419182 + }, + "arc_easy": { + "acc": 0.6031144781144782, + "acc_stderr": 0.010039236800583199, + "acc_norm": 0.5862794612794613, + "acc_norm_stderr": 0.010105878530238133 + }, + "arc_challenge": { + "acc": 0.2645051194539249, + "acc_stderr": 0.012889272949313366, + "acc_norm": 0.29948805460750855, + "acc_norm_stderr": 0.01338502163731357 + }, + "sciq": { + "acc": 0.895, + "acc_stderr": 0.00969892102602495, + "acc_norm": 0.901, + "acc_norm_stderr": 0.009449248027662746 + }, + "piqa": { + "acc": 0.7268770402611534, + "acc_stderr": 0.010395730264453265, + "acc_norm": 0.720892274211099, + "acc_norm_stderr": 0.010465657948498228 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/rankeval/4b284b17boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..85a7b19f5a3f5bf2b52e47a05fa6602bc37e9ea1 --- /dev/null +++ b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.339, + "acc_stderr": 0.014976758771620347 + }, + "anli_r2": { + "acc": 0.335, + "acc_stderr": 0.014933117490932573 + }, + "anli_r3": { + "acc": 0.31916666666666665, + "acc_stderr": 0.013462309712005136 + }, + "cb": { + "acc": 0.32142857142857145, + "acc_stderr": 0.06297362289056341, + "f1": 0.24941724941724944 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 
0.40748854809798846, + "acc_stderr": 0.004903628887264533, + "acc_norm": 0.5238000398327026, + "acc_norm_stderr": 0.004984125363319072 + }, + "rte": { + "acc": 0.5090252707581228, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.5493291239147593, + "acc_stderr": 0.01398392886904024 + }, + "storycloze_2016": { + "acc": 0.6622127204703367, + "acc_stderr": 0.010937034991003881 + }, + "boolq": { + "acc": 0.5318042813455658, + "acc_stderr": 0.008727345583419182 + }, + "arc_easy": { + "acc": 0.6031144781144782, + "acc_stderr": 0.010039236800583199, + "acc_norm": 0.5862794612794613, + "acc_norm_stderr": 0.010105878530238133 + }, + "arc_challenge": { + "acc": 0.2645051194539249, + "acc_stderr": 0.012889272949313366, + "acc_norm": 0.29948805460750855, + "acc_norm_stderr": 0.01338502163731357 + }, + "sciq": { + "acc": 0.895, + "acc_stderr": 0.00969892102602495, + "acc_norm": 0.901, + "acc_norm_stderr": 0.009449248027662746 + }, + "piqa": { + "acc": 0.7268770402611534, + "acc_stderr": 0.010395730264453265, + "acc_norm": 0.720892274211099, + "acc_norm_stderr": 0.010465657948498228 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/rankeval/4b284b17boscar_3.json b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6659526212e28ef78bf54209ba5a048d40499f44 --- /dev/null +++ b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.329, + "acc_stderr": 0.014865395385928359 + }, + "anli_r2": { + "acc": 0.352, + "acc_stderr": 0.01511040450564867 + }, + "anli_r3": { + "acc": 0.3433333333333333, + "acc_stderr": 0.01371263383046586 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.33124459353967556 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.4063931487751444, + "acc_stderr": 0.004901558132335524, + "acc_norm": 0.5276837283409679, + "acc_norm_stderr": 0.004982127315605216 + }, + "rte": { + "acc": 0.5415162454873647, + "acc_stderr": 0.029992535385373314 + }, + "winogrande": { + "acc": 0.5540647198105761, + "acc_stderr": 0.01397009348233069 + }, + "storycloze_2016": { + "acc": 0.6718332442544094, + "acc_stderr": 0.010858184920580582 + }, + "boolq": { + "acc": 0.518960244648318, + "acc_stderr": 0.008738765179491938 + }, + "arc_easy": { + "acc": 0.5989057239057239, + "acc_stderr": 0.010057051106534374, + "acc_norm": 0.5946969696969697, + "acc_norm_stderr": 0.010074093589739203 + }, + "arc_challenge": { + "acc": 0.2636518771331058, + "acc_stderr": 0.012875929151297049, + "acc_norm": 0.2977815699658703, + "acc_norm_stderr": 0.013363080107244487 + }, + "sciq": { + "acc": 0.901, + "acc_stderr": 0.00944924802766277, + "acc_norm": 0.894, + "acc_norm_stderr": 0.009739551265785134 + }, + "piqa": { + "acc": 0.7241566920565833, + "acc_stderr": 0.010427805502729114, + "acc_norm": 0.720348204570185, + "acc_norm_stderr": 0.010471899530306555 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file 
diff --git a/4b284b17boscar/evaluation/rankeval/4b284b17boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..6659526212e28ef78bf54209ba5a048d40499f44 --- /dev/null +++ b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.329, + "acc_stderr": 0.014865395385928359 + }, + "anli_r2": { + "acc": 0.352, + "acc_stderr": 0.01511040450564867 + }, + "anli_r3": { + "acc": 0.3433333333333333, + "acc_stderr": 0.01371263383046586 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.33124459353967556 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.4063931487751444, + "acc_stderr": 0.004901558132335524, + "acc_norm": 0.5276837283409679, + "acc_norm_stderr": 0.004982127315605216 + }, + "rte": { + "acc": 0.5415162454873647, + "acc_stderr": 0.029992535385373314 + }, + "winogrande": { + "acc": 0.5540647198105761, + "acc_stderr": 0.01397009348233069 + }, + "storycloze_2016": { + "acc": 0.6718332442544094, + "acc_stderr": 0.010858184920580582 + }, + "boolq": { + "acc": 0.518960244648318, + "acc_stderr": 0.008738765179491938 + }, + "arc_easy": { + "acc": 0.5989057239057239, + "acc_stderr": 0.010057051106534374, + "acc_norm": 0.5946969696969697, + "acc_norm_stderr": 0.010074093589739203 + }, + "arc_challenge": { + "acc": 0.2636518771331058, + "acc_stderr": 0.012875929151297049, + "acc_norm": 0.2977815699658703, + "acc_norm_stderr": 0.013363080107244487 + }, + "sciq": { + "acc": 0.901, + "acc_stderr": 0.00944924802766277, + "acc_norm": 0.894, + "acc_norm_stderr": 0.009739551265785134 + }, + "piqa": { + "acc": 0.7241566920565833, + "acc_stderr": 0.010427805502729114, + "acc_norm": 0.720348204570185, + "acc_norm_stderr": 0.010471899530306555 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/rankeval/4b284b17boscar_4.json b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2b9f110696b4e2476eefe2cfd2494139a32524ee --- /dev/null +++ b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.32, + "acc_stderr": 0.014758652303574883 + }, + "anli_r2": { + "acc": 0.32, + "acc_stderr": 0.014758652303574872 + }, + "anli_r3": { + "acc": 0.335, + "acc_stderr": 0.013630871843821476 + }, + "cb": { + "acc": 0.25, + "acc_stderr": 0.058387420812114225, + "f1": 0.21497326203208558 + }, + "copa": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316 + }, + "hellaswag": { + "acc": 0.4060944035052778, + "acc_stderr": 0.004900988997414223, + "acc_norm": 0.5269866560446126, + "acc_norm_stderr": 0.004982508198584269 + }, + "rte": { + "acc": 0.4620938628158845, + "acc_stderr": 0.030009848912529117 + }, + "winogrande": { + "acc": 0.5398579321231255, + "acc_stderr": 0.014007765428365165 + }, + "storycloze_2016": { + "acc": 0.6702298236237306, + "acc_stderr": 0.010871682471395135 + }, + "boolq": { + "acc": 0.5217125382262997, + 
"acc_stderr": 0.008736805647519948 + }, + "arc_easy": { + "acc": 0.6022727272727273, + "acc_stderr": 0.010042861602178058, + "acc_norm": 0.5934343434343434, + "acc_norm_stderr": 0.010079056419223525 + }, + "arc_challenge": { + "acc": 0.2721843003412969, + "acc_stderr": 0.013006600406423709, + "acc_norm": 0.2986348122866894, + "acc_norm_stderr": 0.013374078615068747 + }, + "sciq": { + "acc": 0.909, + "acc_stderr": 0.009099549538400246, + "acc_norm": 0.915, + "acc_norm_stderr": 0.008823426366942305 + }, + "piqa": { + "acc": 0.7230685527747551, + "acc_stderr": 0.010440499969334523, + "acc_norm": 0.7236126224156693, + "acc_norm_stderr": 0.010434162388275598 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/rankeval/4b284b17boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..2b9f110696b4e2476eefe2cfd2494139a32524ee --- /dev/null +++ b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.32, + "acc_stderr": 0.014758652303574883 + }, + "anli_r2": { + "acc": 0.32, + "acc_stderr": 0.014758652303574872 + }, + "anli_r3": { + "acc": 0.335, + "acc_stderr": 0.013630871843821476 + }, + "cb": { + "acc": 0.25, + "acc_stderr": 0.058387420812114225, + "f1": 0.21497326203208558 + }, + "copa": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316 + }, + "hellaswag": { + "acc": 0.4060944035052778, + "acc_stderr": 0.004900988997414223, + "acc_norm": 0.5269866560446126, + "acc_norm_stderr": 0.004982508198584269 + }, + "rte": { + "acc": 0.4620938628158845, + "acc_stderr": 0.030009848912529117 + }, + "winogrande": { + "acc": 0.5398579321231255, + "acc_stderr": 0.014007765428365165 + }, + "storycloze_2016": { + "acc": 0.6702298236237306, + "acc_stderr": 0.010871682471395135 + }, + "boolq": { + "acc": 0.5217125382262997, + "acc_stderr": 0.008736805647519948 + }, + "arc_easy": { + "acc": 0.6022727272727273, + "acc_stderr": 0.010042861602178058, + "acc_norm": 0.5934343434343434, + "acc_norm_stderr": 0.010079056419223525 + }, + "arc_challenge": { + "acc": 0.2721843003412969, + "acc_stderr": 0.013006600406423709, + "acc_norm": 0.2986348122866894, + "acc_norm_stderr": 0.013374078615068747 + }, + "sciq": { + "acc": 0.909, + "acc_stderr": 0.009099549538400246, + "acc_norm": 0.915, + "acc_norm_stderr": 0.008823426366942305 + }, + "piqa": { + "acc": 0.7230685527747551, + "acc_stderr": 0.010440499969334523, + "acc_norm": 0.7236126224156693, + "acc_norm_stderr": 0.010434162388275598 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/rankeval/4b284b17boscar_5.json b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_5.json new file mode 100644 index 0000000000000000000000000000000000000000..09d78a14bf53a3b852dad17a0249926e7703dfcf --- /dev/null +++ 
b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_5.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.346, + "acc_stderr": 0.015050266127564445 + }, + "anli_r2": { + "acc": 0.325, + "acc_stderr": 0.014818724459095526 + }, + "anli_r3": { + "acc": 0.32, + "acc_stderr": 0.013471620929769149 + }, + "cb": { + "acc": 0.26785714285714285, + "acc_stderr": 0.05971290310957635, + "f1": 0.23228120516499284 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.4085839474208325, + "acc_stderr": 0.004905674408614017, + "acc_norm": 0.5306711810396335, + "acc_norm_stderr": 0.004980384575535375 + }, + "rte": { + "acc": 0.5342960288808665, + "acc_stderr": 0.030025579819366422 + }, + "winogrande": { + "acc": 0.5398579321231255, + "acc_stderr": 0.014007765428365165 + }, + "storycloze_2016": { + "acc": 0.6686264029930519, + "acc_stderr": 0.010885036980220164 + }, + "boolq": { + "acc": 0.5107033639143731, + "acc_stderr": 0.008743051044836891 + }, + "arc_easy": { + "acc": 0.6085858585858586, + "acc_stderr": 0.010014917532627819, + "acc_norm": 0.601010101010101, + "acc_norm_stderr": 0.010048240683798745 + }, + "arc_challenge": { + "acc": 0.27986348122866894, + "acc_stderr": 0.013119040897725922, + "acc_norm": 0.30887372013651876, + "acc_norm_stderr": 0.013501770929344003 + }, + "sciq": { + "acc": 0.911, + "acc_stderr": 0.009008893392651516, + "acc_norm": 0.912, + "acc_norm_stderr": 0.00896305396259208 + }, + "piqa": { + "acc": 0.720348204570185, + "acc_stderr": 0.01047189953030656, + "acc_norm": 0.720348204570185, + "acc_norm_stderr": 0.010471899530306559 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17boscar/evaluation/rankeval/4b284b17boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..09d78a14bf53a3b852dad17a0249926e7703dfcf --- /dev/null +++ b/4b284b17boscar/evaluation/rankeval/4b284b17boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.346, + "acc_stderr": 0.015050266127564445 + }, + "anli_r2": { + "acc": 0.325, + "acc_stderr": 0.014818724459095526 + }, + "anli_r3": { + "acc": 0.32, + "acc_stderr": 0.013471620929769149 + }, + "cb": { + "acc": 0.26785714285714285, + "acc_stderr": 0.05971290310957635, + "f1": 0.23228120516499284 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.4085839474208325, + "acc_stderr": 0.004905674408614017, + "acc_norm": 0.5306711810396335, + "acc_norm_stderr": 0.004980384575535375 + }, + "rte": { + "acc": 0.5342960288808665, + "acc_stderr": 0.030025579819366422 + }, + "winogrande": { + "acc": 0.5398579321231255, + "acc_stderr": 0.014007765428365165 + }, + "storycloze_2016": { + "acc": 0.6686264029930519, + "acc_stderr": 0.010885036980220164 + }, + "boolq": { + "acc": 0.5107033639143731, + "acc_stderr": 0.008743051044836891 + }, + "arc_easy": { + "acc": 0.6085858585858586, + "acc_stderr": 0.010014917532627819, + "acc_norm": 0.601010101010101, + "acc_norm_stderr": 0.010048240683798745 + }, + "arc_challenge": { + "acc": 
0.27986348122866894, + "acc_stderr": 0.013119040897725922, + "acc_norm": 0.30887372013651876, + "acc_norm_stderr": 0.013501770929344003 + }, + "sciq": { + "acc": 0.911, + "acc_stderr": 0.009008893392651516, + "acc_norm": 0.912, + "acc_norm_stderr": 0.00896305396259208 + }, + "piqa": { + "acc": 0.720348204570185, + "acc_stderr": 0.01047189953030656, + "acc_norm": 0.720348204570185, + "acc_norm_stderr": 0.010471899530306559 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..904e6b5bdfa0bf00e49195038f48d284a1b60a54 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c451cc375539ef38173e9bae1c7dd105c7e5bbe9b595089a860412f6270fe930 +size 199058647 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..138f33d6a86e98da3ab4f2ce84937e76a295fc3e --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b826d735eb843defd755ed31c8672b28513c2cc9b54141d3d4f4a8af665cbea7 +size 199058647 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..113d42219459c9c062cd46de0c1ba45abf27fedd --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2d5f5bf2ab37967d06b67d440f6dbf7d78993eeebf089d30219851450b77cc4 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef5cd4a8fae7a790de005f9aec74595cb239fe87 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e48bee4c2dc7c61ff58d3e44d330df9f0d3d18e72e476259205e53e85dd10d4 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3fe8e0095ead21eb8365687c61d85be900e3168d --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01588a6ad5217552ff00c60f9bf94b77ce9ce89ee3ddf8b6b4be1a7085f2e070 +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97d0d7b07a1aca1d69e4957e016f66caebb4cb37 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38f572d30d12629063ba739e12f038945d2a98f741314c0cd9760832363147cc +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c05b524dccc9b9021a59cfea68cef50183ecd069 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d243d34e12e503b544ef9ed7f9978b3b498717ec79b46ef84111506a37785ae7 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5e6cd59bd4f53acd9de4bec531e7ba464a465d7 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5569c4b3e1201255aa325b5f9bfdcccc963d868372043bdad7e3272f9d79c1c9 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da87c3390b839ee8c2aacad856298c78d2d787e9 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba19b3b11dda86ee2c99c9632dc27139c9da891af82ca44c5b7b6669a26ea77f +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8174580645d0ddeab6138f73102b39cec25eace --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab2cbf4a1966bc5abe9e600b4dc23d9d2347ef8a3041c727772452aa741761fc +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e99a50d8a717e3ab61c44179ad2bde89e039df9 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1b2fd36ca4dd4fb7c2f124178439fbe9e9dddea0f2777404491ffe6b854597f +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6520a298fc6823686d82e94e8d01cbc145463cf --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:869da75c112553287f8aeccf89cc59be8bdb7b9c7a57a53842a743350f10d221 +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad453a1689f7cc2a6b6e98b590b42ecdd3bcecaf --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:564025446f400fa06a80262443d31f25562176914c886fa1ce8ded8f6b13193e +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfbd6dacc8c4897e263375561259b5a614f7ce42 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ea10db86c42ce58e0cb1e57ea16593d090b95ae15325a60aa34437c11010076 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af7d0af487de6ca1bdd4dfb5243e14a25374ef13 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c36212b9d59b8a64d1128d3fd1d08bbe52219aa1dea34c12268bcfbcff191ccb +size 199058669 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a61292482bdf8a0fb873c4c504e0b581f732b02b --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3c469a9977acd6f3631c81b65dfba2354f1db4ac278c2b60b94ede75f07672c +size 199058669 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..391b5a8d2b48e951a01fe2628cedf70b2ecce845 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbd917f94e5a3145ad382a375a472fcc90409ba4dc0d671b8568a13b1461a49b +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a386432f383b3a38cf70706be94469a53c7169cc --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:506f3a92d9b6c2abae260179857fa1f5ee518617fe497c0b2b2c55faf01874df +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df9156a0a3eb5a0db95bab2b9afda0298a9eee4d --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8161521548bed21d07261390af2004cea21d75af688e1cb64afff22bbae5df9 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..782eca2beae282d7c4be8cf905d7b8af29b9a0e1 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:669bc5ff4117f308f79c24e1682d0d5150f8f0e094f90f732859728f45cafeaf +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be842e2611467986eb0a4705f1f77ac4fa3691e9 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8a49ad425fb51057c89e5c54139a0f81e6439bdf8ac9bcfc0146d4453006432 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b81b5815acf55805630a265e1ee8644b7ff1ee7 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b538ab319632aeaa51b5f813c51e19d67350aefebafa16d81deeac91b4ba175f +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3514ad418e922111f7397fc2eebd372eeca729b5 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:012b4c09509874e9a95cd9aff7808bbf5c95029c7527624d00a39e7545e41a71 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd5bbfe78bcd46ff4cfffbfa2754ec5278e44246 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c4a2b03324ab1b3c00b2a2b35847f38df2229bf7edb7296c3e9afc4f9844aa4 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bdda6cf4764d84dc593ed6819fac2a1cfc7f3f63 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:dcd01d3b1f5e2203f246a0592c345785f4bb1e15e84cb036ce78a58c1f84d585 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcfdb52283d8cf7f0f2ff92426a305b26200a61f --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8ab25620cd3747465393e718f6cdb8bd3e37084146d3977066509375ca37db7 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51e283fdd4e5424b997a13897cb519cb936e2ec8 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ad2f58ef07523665ffa0bf95cd90a8563d761659912af3e50566e2584edcb3b +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81e17ab2db09d62cff4632e2677c139ac6d44579 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07c6a5fa61da3cc1575b6fbeaeee77a589ea46eb1892c9636e53e41a4fa931c5 +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..224bd4ef288cf2fec09cd988885c98cd56b7f383 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73a2862bbb17be6b4ce171fbf3ce567841a94e7e2106e3309f617789956e6055 +size 199058605 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10481c5e795553b0a58638b4ff27b11890269081 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bf743bd5ea474625547f08c1b3fb8d9efbe2f3467326856aaa6d7c1672373a0 +size 199058605 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a16e0a4116f1a9c9ecbeadf4063705f39a855d4 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf318a2ce2882a6be25f5909b89703fffb80f46e5a7cca00f17a02da15361e10 +size 199058669 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3fae4e48fe03effa0b7cf5268ffe65c9e93d396b --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cce09cdd73c0ed7faf0f02d5f755c19b038e30a533908df23695116290b6a88 +size 199058669 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..196358736340834fc146ffbcfbbb599231fccf00 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79d5e9aeb5933d3c1312a50c3d17ee80def6cf56ae2f1459c4c7742d7f6bbbff +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0767d0003286f09016822a5f7bd6467436198375 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b45d3252c6517f75200d4813ff4c850cae70d352f68ed8edf1d18c643eb4bb2 +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3e4cce9af7791dca2be4c5bf1bfed7a1fc1e175 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fc13516bb8b85cc2468436a9bed61af48d58ad5bc9a1d4a9a3ba67dd83d2675 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d7ad2e5d18473fa62be99d1d068bf56f8923e5b --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98928470816171e66eed4dec9d9845dc3acf75deda380c1c5a495cb8c82438e6 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8629cc080578127d6fc7d68a5253abe98473f2fa --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0bacefbe878983bfa936900c8d04b44b79846693e25eb47688c381cff1935e1 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c18a9cf782c34d200c8996d032da5281fc41009 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aa94d5c702091fd73b77e910988cec88533005e97142eb05c1b638bf3dc1eda +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c37f2172d423b1ee57f9a41fa24f023837a049b --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce7fa89dd5923a5014e14b28ddb5c76002a5c9d0f51c6ba241edf545c074166f +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1e08e2a29c93d3e8a09656f0b7e6522481d8fe5 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f17ab60d7e9112f4e4e24127ef44b265630206f4bb1681fe33994ba7b8ab2f6d +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad1d79a83c43998158c9d947b876daa795778a55 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea20127f372d111599f7f9494b961bc1c37803d8fd162ce16ac4dfad86bd1cd0 +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e7412951af9c90dc5a51d8a38630a7f51cbdb82 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8b8d252221cb1aae5164282d4c8728ab86b8f742cef644e0080d5e8fb318eed +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38cd7a08038a286d792f9367c0e0acb9443b1016 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6d365d7c52aee94a6b695d0acf047599bcf053ed9cf68c001c0c1d9c8d8e7be +size 199058669 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb8c35a6179704c9e7dca332b3245cc3eaa39229 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e3cfdb5829085635068394cdb17d896f53ef7ccc1e3086eb869a03fc7bcd508 +size 199058669 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbbb129e3ecce70a2bca0f2009346392a40fa451 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7de9c29a09a8afa3bbdfa3ab888d96c7f96201f5336f3c0e687467ee696fe402 +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92cf79ee359bb441ed896ed3fa8022e8d6761571 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae8b2fcfee895a065e483b28ce809a7bf78e9c7b0b8f351af82314d4521804a +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f45c52ac3a44a3f9a59f4e5305ccd0e5c5a272f --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1ffed8c7885f7b3446457ac67ed7019c5dd186173d60342f3edea38f6d042ea +size 199058669 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c9b6ff12e6e73f8d5a4ca98a72f29608985aac8 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0470c90b2ebe1617b622f9a554348cf5cd7c3c9562d2e8cc12fc9e7b033af9e6 +size 199058669 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10b248f758c6d76ec0623c2057656aca90119988 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:223684b09ef54bcadab430193d19109b35f46504329ccaf8ea3ca8d56a058719 +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc2b66cefbe5075ab52a1865379470abc4d17d63 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b2feff938433ded2936957c0f148c20faf4b5a9a7b61763109b56f4f3b4409a +size 199058797 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..303be6277e6e22d1dc39690ce27bf6b92e34f075 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:d729e22744d5edbd5e73a55c495e66978f9989838a526ae1a5ddde6941da265e +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c956d3d600a64c6b3c67e22f0e52bc684926a40e --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdcb223ae754d6ca46dd181de0d128968fd77b96a54da27bd34dbd92bab00f5e +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e128491cb083be6c8dcb1dcd2cfa220176a2f5d --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a21951833ddb2287b69ffd41991e95ebb9f6c9b4879eab62af83071f73477291 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba915c36f37571e7adb792309515c5f651ceff5e --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:613a49c6e655140ff4b3c677be7d65a3450662457b42698748c7a338aa8994b0 +size 199058733 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9247a9c28b63e57f3c6a3c8a88d3f29e3cae426 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:946f698e79eb7b833c892e7cef034c55f42a98937bd03b7febfee4a2a9c29712 +size 199058669 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2ac46a42abc1bd10e8a9af6a3de9d7b284a4b62 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18cd6affb1d2c476b04b1e31b28b67b422df76d29b6329034eda7e233aa3fa21 +size 199058669 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5ce773b8646c97cc6c4cf12afd286c586c85f46 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef50db272d86a3b4a2f85e64c671016ec986e13496ba56963f6d679844802acf +size 199058925 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9978ff245051f482de094b25e54b04a2685bb99 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:902afe1fcd00c5b8fc719b99d757b06bde310b91700e8cd7cf679ea803485923 +size 199058925 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1aac0e43d37c18fab607ea9b881930616cbf22e6 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a7619e43b6d7f80c8f11b82188f178e6e6efdadbcdb83350d25e6657661da0d +size 199058605 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e020c6bba3a91a4af90eb262a42d68ba8b31cd2 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee48072a705f1240c3f59773e8a3fc0576ea9fc370f3b30ed6673a2989449c0f +size 199058605 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49d953dbe8102c4938725061a08a1c548bc17c0a --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fa5545c753e02d04e12df104f44cf0674e8a9d2c9359c54785d00c224e9cad5 +size 199058605 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c37271846fe06d0c66915f6c32131c0fb0253c5 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68b3e97123c21e8ac0ff20d2fb40cd004935e18a76911a4ccb03502c3d2a964b +size 199058605 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec15a9a9a992cb912943795a675092c39198cd0b --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7286197fb2f349b6db7ca47596babbc0522ca8579ac112ea92f46d2aa0ba008 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b2388ca56ed6d0a9213f03673e293e27240f919 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:05ac57c99f7a95d6af4001fbdc23ff88f28795f89d5f02e115557ab96dbf1db9 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf3adc002e30873b3bdea93c4b836f40a74ad009 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82288364468462e64a9e58f11785fa78aa2cb9f45b9b3eb8fe160dbf77db1b43 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcc49df0623ac9e87a25983b4714cbf7749f8f06 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c555f9ee21c58c67c626859e3ada2f91b3d6c23691a46df5b14ef3cd43dcde8 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3155122f39290a16c9301b6d28a3490488d6d97f --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3c1ec69e6f5ef0e4aa6b6d448642ecb4df7ba11233f53c3b82814dd9cbe47bc +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cc7f2d33ae2abdf3c72eff98f0b6367216e6aa6 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d5b59b15e414e613bb5dcba3446f5486ece727c257e779a351a7fc2124993cc +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5a475c89d1f66c3dfe0f3b0d57af5cd24ed3645 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:152c0adc149aeabf5b14c020d08efe365c9c985db6350b33d43eadc856fc45c0 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24a9e223be12821daeea8eb180ba9ecb664fe3fb --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81d8d1179bb586fa93071bab91a22f6a4b6754b6b64bc9330fa3552c83544556 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..3b22f6d661a8d028f38952d4d99ac9d06b6dc3b7 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a420f109a9d2c91fe18694578a49e574190a87c754c54f872b1e16bd692bae2 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3afd266ff258f57ea9195d3c8f5c034c961bd2c --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f086ea148fe0056ceff70895079e09a36d7560805c0ffc71be87cca49cd47862 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df4b0ba6ceee15ac99a0358891202755a00e84d9 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:118fd0b5f7a2a49409b949eb2d005cf5b0b7ba7a01a6340ed18df68039a5f65b +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..473b27be1028db9948a6e43c9a0dd883ce795bd3 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e9fb4f735556d4da39c118bfdcfc2a0e0feb3d433945e9d1d384d9ff970a2be +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b7e654e8b0a98319b049ee2bfa0cbb3c4f569e0 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04fd9e7a619cab21d61eb7a8e43cd07f3473a3928fba4189fff33d8d344a501c +size 199058978 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82bd3c4dbb4189cd4d45f9b76ab5edc2a82c00f9 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a50436251cda2c15e2cdda034f852c5a1167f7340978fddb3ab5f362a6c9b6db +size 199058978 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b1a5637cd1adfdbbb27b7ec50b2cbc7eabfb9e9 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b9fbb7d4f9a8600de4cbce5517aeb2ac90e92bec47a82b726de745f3d6a9a0c1 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..782b571ff9a776e19c4b2b6ded62294422f00f10 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0edb01bf0a34fd4f0dcb1944b0674d09e842a2650b3e53ae84841e4d77b80c60 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b75f1510a139f8779b7ac73489e90eeafa192ab2 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:675ae2102cb7420341e83530286f81bee8b996d04beb044696b46b7b7d9c3f0d +size 199058647 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbe0e32a8e0b0f39c9dabb318b57a5f4004422f1 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a0bc970fac299f468ef52e580ab4f5932c6f7476ca5fef8a702d92c9b494d6c +size 199058647 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7b16d97e18a45482d1bf5705d4312bb01066d48 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75e920e3a0f6ca3448e0a47baff08ea85782607284dd1988d935ff4f47900a71 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dad6b4536e246efe8aaabe7703539fe9a3ffc52b --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b93b31c300f48272447ef4cf289b0017871bb81c105ee974dc1d5a7877d127f5 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ca6599c85aaa1adf382e3d8d2c0311c2a84bd7e --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1501632c87e6038e0f67c27374a4b50f7ae061651691478392bf5ac91386c849 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..b2de929f6ab1ab8e8bbcdc696fcc89e1bbff7355 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a767e689e94459e9c30788fe2f8744d242c6060fa3ae90e68d657b41089f11db +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2d1862d90c476cc17274ecfc8b8e7593a02da59 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acddf212d64c1dbb1ee6be025412bf1b4510a00b37a040f48a9cbd71ac06eb2b +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32178e70dcea5979c31b54366d6c7df40ae541fb --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78d0253882a8ade4fd5ae49120ee6559f7f93a21ce7bb778229a2622a84ade7f +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..238ffb46c2617a2e9d903eac258d96345d252fe7 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8bc13eefd8f12db0f17b1ec736b01b019af0e4c223800d7d82bdc47cd4cba1e +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00a9eafadf3533683e74cfb0bd007e65a3573e6f --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48f8aa306c25a518e0d4677a7535cf7414823c42af2cf0dd3477b8bd956348f8 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d032ec2f7770928df25b178b653c91c1e89fe01e --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbfa151f6712b1bcaf5b554cc673bba4e136165e4e0127fc9dae5f45644e3eba +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9abaedd52a3981f95bf166ab42badb2cb5e03445 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54ee72aefc24c1fabd17da04a584eea288dfda60e1c10a882675e49b15544e1d +size 
199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f5ba1731103d16028a340dfcce612db3688fa4f --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b6b116302cc6ee87bb8a39fb18c93da7c01a3bf3f4c1a76505e2b9d3b4dc9bc +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4179a4e9007da259bc847610b2f1537a0a76fd3e --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb963514c7268e0bbe9cd4ab4ba27327c4bff622556b81b47bb4bb3ea2e989a2 +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..151bb8a96c1ec4288a6f8e9e53a2a37f1a5b5706 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19838293f5ac44d0e0d9bc46b770aee86e1150ab7e4df9b8b8f2c541b6bb43e2 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bad31da2a7bb46ae5a4ee041902b90cc141eca92 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:007b66fc1949a103747af2b5b79369ff5d4ea370ff458eb97560ee630c721a65 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c548feed2a829ba0a41d0dc9b8a7687112709b0 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c3ffdc3314a8db40e92181eceef8ad47130419a1ea81148a4e4fbfb0b44704f +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c46b5ebedb26d9594319ea218f2bda662f587075 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a14eedae368d2bd5f60072cbfc7c0bee959023ba044acbfbe38517634d45a67 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96fd705a06154b33c5be11a43dc656adfc1b1885 --- /dev/null +++ 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc7bb932bd68614cee557e705e36b35498f5601b2a6313afd55daa23cdd4d508 +size 199058594 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9780c8ec6f1365f149d5b810524be4d20ecf7c3 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee95a33b3dc6cb835e77d8310f111d5a8f726cda78daae61dcf8c956c7f136f6 +size 199058594 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77225c2e21aaa813a4c06704488b7a07b6b8a0e3 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cfc5dbdcd2e2a05d26e43043dad3fc9f7a2619cfa59baa2bf845b4cb9433aeb +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9408ccffb872da2aa11086e1d362faaa755aa87 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e0f0580ba9d4ebfc2deb3bf722eb88fdfe31102985114023b79fb15da35d996 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3f7c86bafb6edf83a434a172a16857a2d204321 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eefac9ebc359a24d4332b5dfd42540867e3e146290a7a121e4eb9b833a3f394c +size 199058711 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4157f6645120d5d4379bffc6022e0638f9bd93ac --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55a02e197634f34aa56d6c3b609dcf8d634529c5746903c17d8e8dbb5b0e8c74 +size 199058711 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a698ea0b72cb74f0b418cf6df9c9f7560cbe38a --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f711e9d975a61ad6ad5103e4bc2a4eb925eb675dd5c57d5ca0411da6d12b06c7 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5417b105f354f4ea79dfb74aa8efc70866616141 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc9f7b3c3b29ad29f7ccc6e54741954e9ad64addc25f0eff65174d03c26f3a84 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..468d2088c0b9cf95b565c6e68a9579464fec49ff --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da2bf5eaf80cd20828bf0dc1653109e7de77b399b6b88e7f6a57e475a653f5d0 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..05790989a49092e71e417f580152a9c0200b7cd1 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d580b0a4b38fd86e8b03073a874e3be6814d923013938fc1e2bb5444d8fcc9e4 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..494c176ed56183b20471aed444b53bce244a563d --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d5220fad54869df2bfffb0b259e1e9e19bc4d808f13967a9ee21ad939085188 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d65fa50ae99f828f23ea35a3c89257666797f6ef --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c1501ced57e114a48bcfe5a9cd9879d9008bc167488d71af86988c176ec78a0 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cf19a04b9534c7f9f983a3241e978efbf5f4533 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84349f859227f97dfc9e5836fcaf4fd3d16e99a8a9b9915e9f48eff728f036b1 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24d9bd6238862e89a64970fb2fda7b66caa348cc --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:3b28997a249622cbee59b604220865df2aa499af78ca36d3fb26f061228aa3bb +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3404b307fc62ec87e9ff7f26240dbcd8acf7cbd --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f18a89ed4733582772c3f4540bdff8772a49c4b82e334e644e7a2761011c482 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13feef47d4e022d355205dc5c0fb83d96b3a4c0f --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e5d37cb399c137dae52c5ffb9775b0f1aab34a70f4c96206c05760e68888ca9 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..29a2471de239e73971f44656376338b44e296b32 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7195aa6da80ebbfb0c42d65649895001a4dd12365bf5777a55beb6389badb3e +size 199058594 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8a03829ac5f0fab6e530c76e2cec976f49a11db --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69e9598657870b8626f8b2a9e44a362c4c384df9c9a04eaff1ec99a9dcaf0610 +size 199058594 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f70c82ebbaa8cb46ca0d6bec6be3fd84f8bb3ced --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e27df1fd5426fff7a6766476dae18df0adc1dc0404208a1d6146bdba0d57bea +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fa82bc55f7e0afb28a21bf420cb7df88468997f --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f91306bda2d976eb4166c878adb567a0ee6da0edcefc1b02dd6612efd40c200 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..8943a57e87ebf7a11446a54ffd3a638b9534b25f --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3e937519049e44f7118a5db8c89e4ad30009fb8042db7e5d2c2de91f90c6c17 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e61559542520b0e015394d7950e87b476c758ea1 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39e4fe25133651d5a63b73bc840cfb7f44fb4af86bf221a253b6b03d2572efaf +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6fbfc951a0449afbfe30b34357419ccbc9cc53b --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf3bbb7f27b3f247c37a435588470ed9cb413a1612488a99c693212f58241e22 +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac7d77c42d840d7583e9af362f5e2cae6a64f01d --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9253e8d64d0f5a406cce7f833f76819cbf970c23068acd595d2a0aef439c7ce +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..080b5c119bad0c48606e419ee96c78cc47d39871 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1127ab1d98787d81a51abfdad689336dacbf696bfccccfae1d7138576ec553c5 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..830d1187e9ececff19522281b350ed6f11ee0eb0 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdcce7f773b6db2bf9372cd1ab46ff33a34fc9e928a5378f561994f92dfb93e6 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..caa661af7b9f4b3b06587695f8b3b01224c9fa28 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64def5cfd3283960e039c8c8d11ec13d25ab77157d90357ae8c44c4a61a2037 
+size 199058647 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8f200e8900925c36edb09be37005ea226dea539 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31d8c7bd86d220d9efe7de75af5ffea7deedba7b438eaa362a43942b4b21314b +size 199058647 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b164c5bea3b4fe78a230fc38a8eef1c2ca61c5ec --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01bf860f03be7499b5b27e23a4f9a142978375964dd186a25ca9726b1a5cccb0 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..942062d2a07ee179ec69b0a9b6ad794bdcf5b002 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbc9d1bd2f8fa1a0c5cb31b9d584cbf27ad5544a23de6527bbce11c8229ff9a2 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75b7fd6476c79bd49e609469633b602f9fd1b202 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d2070724f586a21facbd71f665896105ecc689eef2673d3442eae272557ce61 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61d0ed01b317728751af04cb7fcd74708bfe8ab4 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56c81db8cbdc0a8767293757b0718dfc1b15b04edfbce0a7d85a421e41e61cb0 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c8ef5fb2011a32302787085a83432d882cf4def --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85e6a30654c36bf14f92bb891ca82b665674f3e39321968f73d3e6667157978e +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15fe9aea3111884ebd6d95f1c3de0827ca4a1d30 --- /dev/null +++ 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:367b1dc82869ba00e850e7a15bc4523558a47aaae32968bed2d03c14e4aa0c58 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44060c12abb033c63615b28d3854a864d335ff3e --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7780c867ef2e941aa4524c36e37119d3ef3b024ba5e8d255feb7396e0f87cc08 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7357f887fd10443f3ee83a88aec875370da4355 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63db6eb4a95e40b0384b4aefc6baf6c032b9a0f3b5ab9a64f6a7fd0249f1d4e5 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c617c5cc51d82308a4a0b9bd7ac6fe7441cf0edc --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8918e065a2cd3e8e8e15c1700d9aed85ae403ca9dbf8c3768fb8af61b83ad68 +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a748878b2d4b811402dcc1dbb1794659034887cc --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e4f44f0c59bc4ef5e89d6b7f8e9ad7c1cc6da07d244919fd801fc709b59342 +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77886f3cca693f8ecab42989931c200f6dc32c3f --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3dbb1b98726e23c9e5210983b3a9f9868382a316f415089298c133b64551d8f +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9a345d3b633c64eb901fa50fd701582872fc929 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6eacd2193c8eb1df2727455f8205112639d659b82817fae18d5b179d5e4d872 +size 199058722 diff --git 
a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fc65a441fd8906bca8f5b88d8c588a382904dc0 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb693b6a45c3ee7c935c98d381430fa02af77ffa44ad1c22721b7f8aa23d01f +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ce11106c25d8d42be472a323a332146dc9c7349 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0ae99cecca641021385cdc8db4e4c2b8fb5dc932f52595175038d31f9918a43 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61f49946cf07bc7160680210d8698e5502255eef --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae57db4b1d80e54acaeb1ac40b241cfc8d4b4bdb1b987b02c5ddaca06feca187 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f07ef6b2e4f03c47c4cdb87a0f315bf394bde24d --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98c44a2822709cef00e2f457e57d2585c06dab31358ddab15e25802d79cc6365 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..535ed6fa9db6f7a54440a2d63ee96321a7e69a71 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5968719e02ea27006f9cea28ad66af1172c54a95152908cdcf8730e3adaab2d1 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..efd893bd4fb9b3730d6b8c54f86e356f0d7f23b0 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8f28caedc8f80ba543f9de124828409fe890c2a5ad679fc21c3cf7dcf10bc6 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f859d438d7816a4d820b1b44d05977f30dda75e6 --- /dev/null +++ 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:260119672e6c11c46ea7271f9942f02b82181411c6bc8997e6dd15b5cc8c881f +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77739ed4d455c9e4772f6c460f493413285c902d --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d4d1b37901039ac5ebc00d5bb74f56ae9d5eb153617df19d2ae41a1c55a00f3 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3de37e5d5a0111fec4d19192bbd1562d7dd4e7e0 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71faff23231d85c591934c52a5a7c7522f39ca1d7c274f72584934fd8a59f704 +size 199058775 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a79912dde892ad750391684f8c8e1e734a7b8ec2 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50b69a103322d35e91087937eb5421e05c4159e2e806308e76ef655ecc43f1d8 +size 199058775 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2178ecf0118130ab2c191edc26efed25575ec20d --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3a6afe20e7603e167377cd6d17015c02d0e61299e70a553056f65880f8cd57e +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c73ffbf172291e25e8a48e0d764e71629ccbe5be --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:581295cee575ea8a4b14f41050c595c2b55bc7c35162a2cf3d7174a6c9195fb1 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f170b6bb84020b9ae41bc8c42f2ad83833e5d6e8 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c867372eae378aff92953100117d66ec857f14dae3959b91fd384e311a5b16e7 +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5cf545c83f6063eefc271fa246cb6fd16448510 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efc197ed595d90301e85b9ba0595c79b5a5661f6f1eb29c8034f5390975053b4 +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d1bdd31b89ba2d5b7dc425775a8303059bfc755 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d437bab50b510ff0931644441c9559fa4140a35f86258d65e1db1cc4dc5a3712 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b88541630b14feefec97cac7b9b24d125e64c8f3 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20987314e93b867c14be7bc72e830fe7d6324c71e231fcd05880eb94b0b460ee +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d197bcf459f32f70bdd335aa121e45dff33153f --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:738a19f161136fb6636b3fb3d18228243805f77c9f7b5ddeb265cfab5060f822 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..05806e6e66f0ada7e2dc9f27ec4953c6fbcbccf8 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a35bc7b2f43524350640f5f987bb46662463189dc449e12a56cfe33805abbec7 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1dcb0a9fc7b300a774f2d2f09cea8fbf2adf3de --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99baa0cc0bf2a011b9bf3d7cb94e426773d4deacc74e31ad8f8fab423a83e29c +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d21bd06f193ea7a58a21b58dc7bffcf89eabf7da --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:476cedbd6df7172335e3037100617c4b9e44b8dc87a7d7948f790b290378b7eb +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33939f117d3d6a48a44279de9a7551947cf3e6a6 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15554d580019917100225c62a97e4a94687f2bdacaf650abd6bd55ba5b127ab2 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88cbf16a5ceb87a8749efe0e9c11786716b76337 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ac452cdf80af19bd580fc6974bca9e3fe5a6b6877fdbd69b4566fe0d17c45c5 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1683e6af9b0273c72eb46f938234fc96b19215a9 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53dbcd6273b4855bf352349eb70c3188964e303d8b2b0807e862898fcd0c2f27 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..93d8f7de1d57e0f30455e5ddd426359a09c1b998 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c664ecf4ea3ad446544735ed8062c5cf38b325582474f2b8c62f3d1590630f56 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b7e03095ae43b8b335139924e3349b1c64d0a95 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c649e34b40656118a80e1181ecfec6baab1297d533047c87ac235f45bf5e187c +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5711caa90759b5813682695dd0796b754c738eed --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:642c125746374ee2807c0c9dc9b305a258e7a5b1b42a981b9171f0aedb559780 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..8c073e9b63331fefb9e02a1584e6c53116182ce8 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34578675ad4f3024d60042f0673f546ad328b261c5735a73a6b352a39b4cda88 +size 199058914 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33cbc94419ac02e21a56544319539af16765b961 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e45493c3cb1bb93f8a818604cd45a4be36f380c017cc7256f6fcc4d1b8415eb +size 199058914 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1beea70489a2b488e1070dda65d24da8494703b6 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e38286e9cfb5084de35d0d43d0d675818f5eae07fdd17c82e1db4c4061687dd +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4bd66e900e93985838e402c71a34c6af64e0090 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90b49e68419ea15e9fd163d55a8b327626d6896df793cf19b44978557c4020c9 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c1f8d0b2dd9e757d125903f579bee9d8bef66cf --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:921c176dd1a844e28d868036703c800c8a64388ede4d3af09afc588026e50f89 +size 199058711 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1bdcff23e99a72c568f9f153879f54ab18321f68 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c516fc5f50d305604de9320d5a7c95c085469f2b68e2b7f6531d5cffe3e4abb5 +size 199058711 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bbccc1f6f5b7f053d6ca83380d579650bc6b4c39 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a239f471ff1230753915ae801b80ef15bcc4ece31fbc9e799643e0a0816c85c3 +size 
199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c571db64fe91641ecbf47ba4898430131a70c52d --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66f731f902a50228d099d79cd290c98fec8913ac4bb367e53a05f4d7856a3db5 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd8fa3dde4d0e3d8a09a841104fc41c1c21ec7dc --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a40f66e6f4e1f6980b51a3a59631e4d0a77b1ed826310a40485a72a7571017d +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac3a272eb845f7b5a590854c426a1025575d2781 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:968534822161dd189b0c04ff9f4b2e3204e6ede4278a16ef7a8f0c000dabe480 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3eaae87b6f52df838e995a107058fec425ebbd82 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3572029d6946a97ead5470ac8c0c06b9eb1052fcdd14f5c7a68d40bb90f5297c +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f048c65589b6cb71a1a325cc889d7bfad49fcff0 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04bf24c34be475a4aca0de53af1406bfca6efefbe371865ba4bda901f221b1f2 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a36f891950ed7fb641bb853251831486a3affc7 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede4196a7a0ab50dfa49b1c56e35974fd2e75396479af0ef57ae1cb6e20afbdb +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48803d5dc0e20f3419a0a78be02cf3fea6a28ff1 --- /dev/null +++ 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:639eda3443f9916ed65ae34d8049b81105676143313ef0020707d44d18918729 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b31264e3b0b68200b0c78f04ca6861b86c879539 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b462a47ac93737929a972452876995de9c4ecf5e6065bd3c32d5360e433f45b4 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6dab14d0b70b4d15629b1ee8025250ef7635694 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b059ac020b70072151861178a060495faa4052aa3b31de2908202056893b78 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..043280d9004f4e830017aa8651672481e953c69b --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad85ccee95ce0df5beabe3912675e6d008e154fae66124c5518eb39be8acca1 +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cb9c91f5eca56b257197d1ea23d0792c88207c4 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f0c2d2e3c4c7a1176507b807ead4e9edfd2c74f54a97b9a948823ea65640e5d +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21cbbe979b872be1c5d99b1e19ad381c346cebf8 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc327d4929338431b2fd2c1c25f1e40dcbf1515d4f719fe55125461bdf020560 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99c1b27a214b0969327b5819a574014e550b4ce9 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f15e5579808bd650e7dc9b7939ba5c779f28b990b80a96505ee433a8ff990891 +size 199058722 diff --git 
a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73e8d4af1f007f02a769021fe611bfef31294b25 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75b3627104b7ba554723ebab930d80ee77569ba0204b2080ba7d7c90951f4162 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4e043f7368e213d269357db9156a204c3d1b13a --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b1e65fe4fb4c60dc527f46f3716596296a8827d3667d2192b3d12d4adc3e22f +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..08bb9f64b41767c53b238f7a2c7b5504140d01fe --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ea9a6d564d6deff35c7d9ea064daecb25897010d08034eb3f283591214021fd +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e9f8ccbae769a29b3d172e76a0aeec50e5edb0c --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d799e6d3e1e4977f48fd577d18f21c620145620068c010000111f7f5238e41a +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7ffaf5756668d28423cf937264479af27fe99cf --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:278231c3fcd2dbeb1c441f1a3b1219097e6d1d3664462a749f1628d138978763 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2014c86f9a59bbe700693b5130108c8ad0d2f34c --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d4157e4eaf955275db6c2c99b44ba0cded854a1043f8ec605134f498056895c +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbdf889041e9c8b312884b1db6d6594b064c17b8 --- /dev/null +++ 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4cba99b095d18322b348a13a3d4a852610db2c2658f7fd39ea94d3a38620e9f +size 199058647 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f95b09be8e0c099e53688f613a6ac2bb60b4443 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f931cbd143c2a50856b3e425b75a733e15dea7d8ef045aa4b5485726d83812d4 +size 199058647 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c18c4d1adc3ac48a4e6c97f11dadfeda9f967d08 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d1e857bce4aae008c74193986749630413d26035a1b03d30652b91034fdc6ef +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b5154a3dfd0e71c0453a013221f2b58275cfddc --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40eda24c75a1dec1a8914b4426c7583f1434e1e3ec3f32a77ee5ff0f2c6d17c9 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3615fb90b4743701fb3d6602eeab2d7c8d4ef2f --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8970a081d2942e2e0add85d7d1d0468e7e1c64fb9da0ff512a910d117bf7733f +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecdce1a01e19944398f7b3007ee5b06b2b8c1658 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e37975f47cc045604790d58a55af80225003cf7478391f9c6a764e26fabf21d +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21735aa8d419bf3f4ff4afa254a635c6d221ca9e --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2977e4ef435ac31ae663142344f887528c2aa4ab4f1794aa0406de4ef94fdef +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b777099e8ba760a6fb30e37fcfe1a1b2e9961fb --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c57012025edd0a7285850ef9de27e4c3be32f305bd993eb090eecd0f6a1f7fba +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da6cea5c94926667243b32c04fc0086fa437f8ec --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:698bee2fec0e3092614eda64953cb01ee5486fdb8c48e2803cf798a0d9429e62 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38eed1563d36be3b54a0cdaf8bbe6177897eecbc --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9883e28075601219939b8c6f2dc215624a8fb20ed642711f463e0a77902a32f4 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2137473a0f3e4a687097dd9e5ff779bb74d3371c --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17f47560ce5143426e4c685b8b11850ba75bfff3a8a3f000b26b5b9e23f48d23 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c727749f3db1b0fab911ea83bc791721b5ef88cf --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bd608ef5f90254753b888d72f8946435dc741b81e983496666d95dcd708b5dd +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42beb3f5d8df267e7717b4d46cc84d9d6ec580d6 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d21b759fcebc1dfdf9361102f88e1964df2e65e5e4cf9b7a2f7acaaccefb2f6 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe1278084bde50788e078601df3b9b31d19202d6 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:1a44563c75e6e6d434ff6143d4db88959ecf457ec704355102e28aa125690a2f +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebafc33050b4d6f6e263a1ffaf1a3f1bb7763d51 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:653a7a8ff161b594f5a6be7de57fed7fee3fd8f08c34cc38edef7882db508b7a +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65982c2096f11d5ebc7cf460c900258e1f6f5ae0 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec4088f4ac8c6c85b85e5674f537ad51373e4c4a092f60636f2866a4b2db8cbe +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b58acc05e0ef00ab6ae1db4000c71fcf563277e0 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a56558f8b515df122d492b2f109f8e2c9e4a09ea2290b08dd58c0d510159ac7 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e21130c1a68c657505dda77ddb5519ec5ce26103 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aea803669f77d5e872bf148f06fdd762d4558f6f04e9a4f4fed9310e189a46ad +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a53c8e1604527bc6b9fb904d26bbf664e328831c --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4c78fe8bd9c1dafccbeb2a019fdae1020a8156638662d8b4dd0ad2f28c7d8c1 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba9fb3e5a5f776c3cd4003831895429751344db0 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58db1866795c1c37a9cb53b4285139ba40510d8091ce4eadca36313c56a14853 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..a2f3a6b949e6cf7392eae6dd7436b0c20086c902 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f8ce2fcced9285fc4fd7e3b5a278d1ffa01ed0a13fed8903ea5c7c6be47ca8b +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83a1591c3ec59c9849449743c6828489c45081dc --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e93d86b46daf0db2ffba98e3dbde6c3531d936d3ca6dcb329f121bfc3863b8bd +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e97dd8d2cdc5834825ad967c7aa51175ba1b428e --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3360e3937759e9c502d46edffe9a1cf309f2ab419c6e95fab6a462118c7e1f88 +size 199058647 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..defd697a17ff8c2f198fc5f35a2fa8cc17553dbd --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88fd54335cd9dc6a31c6621c4e3f683cbe51d40369a39edb4493cbb81a3bcd1c +size 199058647 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed17f4f3eed76eb00d3118f040eecf835c9dad13 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16e35323346d89d9afeefe96d5a8ffd0679d4632636dca0aff5f72ce25edbd92 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c13c53c420228ce43b509f8e5325e1c5bcbb90d0 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4228d6c54aaf5e715abf2bcba953fe6c9db3ad9ca688c4507e42e00de146eb56 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6dfb00112de48059e3106c98643f5b7f6ba7d55 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:858f782e6ab6900c3ad9037055b4f2e6378be5af5f68c661b58a5e9606d23267 +size 
199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..314e0abd6da4a54ef53a046510303f8f8039d8d9 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13874650d29861f4837fec82ba07163073c0f350fbf7d1c34d3d7a4c72d1d03d +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5545609e3af8bfe75588fd862ba7d3e27b0cd6c --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c5720864a08908bbdc8bcdd025159cd725b95e08f28375951ad6b0874204a0b +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5581d896ba1602be486a03106b5151c9d7a8ece --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ab0df65fe03c94944d56a3b5143d82a00ae9eda1363c94c08476e841b61cd9d +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..50a6f44207a2b5fd466b56c78786d7450514f5e9 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f25d803ae3c74445e70e39398b3aaec21854cbddf723d3450bde7cf01d45a66 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21fd67edb12012975bf4b339db564e1e51238c59 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43e662c691c4a729079911d43a57bc5889e33aac3c78c856708507b84424a835 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0c3fc99be87197b215f730df2adcdd176ef7f92 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c004122f3a0d3ca12763e89fc4953fa1e2ecd2dc0bda6f290334f68b8bf09ce1 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5660208bb4b5cac438c14a5a4700eb29ddb80de7 --- /dev/null +++ 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53a101a8cb867ae3a5530c357a9e2df686490023f56bc92e068e76e7ce752592 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cea1dd6ee6949ee5b9e00c5b4eccf8830bd00a18 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77ec2119e9819dd7047806479ade35449f81a18855761ec1b3338333fd9643d0 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f3fbd1959222aa48802ef4a823fb085322d5b14 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce6429ed08840a5663068041470c2a627dea5f8d9e3cbe725b654b5b36b2ed97 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e6cabd902408b4e4c049e7c01720d3d96870d3e --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26943305e502400c687ced9664b6348645c19dd774989343b0031035caa7c641 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3761db12f3a24757cae4baf03f8637429b2546e2 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb30cd457429e08edc3b0c515c01271ddc53af094a25d8ab87ace52112cde1c5 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6cc33f72e0d4f3b5ec3986555ce89283c6d6344 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b91329cfe8b4a5c9b6f47ab2223141bd85d3cc372dc2ae2ff10e5f96a51ad21f +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..836c85f5c0f38b6ccfc7b9c244a208161deaee08 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e2d3371d9fa1553608336bf2a28248c3f7d73b4b52d29e1b2c4fd9ea552ef69 +size 199058722 diff --git 
a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a5151e0c6dbf18d79de1ee4c79f752e09ad35b7 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa3cbe5eb80a108df472bf449c49ca1d1fb5e85caf319af4bbb3342dae85b5a2 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..668bcffd9c4d7b9d9dec69dc9e3f075c96bbd18d --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f4d6638d9430a05289988159ccb6e2144e2347af3a2c39d028ed91590bf8f1f +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3fba40704e35f8e010ad6a3ce71bfe720d08c09 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fcc7a19999dfe101460ac236d9cedbc754dd7beb58980653c95b7e75a08c822 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d2454e4ef84ab798225d2321d25bc12f2a49450 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd69de9b72a05af2d3cc0789340a4449070c60bea892701abaa36aab77101b1c +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82a21c591d99331ab00cfbb87712efd0790f5273 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab21dadff8a6612bb85fb7af857f15d6d3af21ba1a685d5ff280f57210fc520b +size 199058647 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b86a05cf2dae4fd3de1ff58c02d3d065c4bb25e6 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cc2861896dbd972d77943a23e3abf20467530eba43d98741746c3a94a65f235 +size 199058647 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c3ef1fb6ee4e2cd045928623dfa500bc1f605c8 --- /dev/null +++ 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feb97f3e554bd73b9987394bf8c1f9433f1caa09e25093a9ffbe6fc97cbe7ba6 +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe0e590fed388f409668bba1aac85cf163481fbd --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27991e5126860cfc698dee51ec6c84ed6c3567722a572f3d3802016d9577cae2 +size 199058850 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..685096c2f31ad1de6d9306f6631004052e1140fb --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a333ef5eb49524da981211e6e47176d2f6e850c9f1e89b40bc12da965a97ac22 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68d1d80241a97734675d79c6390b50a1f65fea87 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c77def45a9007699d82e89aeef83f4b6beda7480bbecec9c583e61a8119111d8 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7925c08bb07c9f4f72bf23148123cf509167c09b --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3962d5c4ffc09446c2fe5842746cf96271670950701355ec58656fe7d4417bf6 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77f9539764ab0fe3cee060a89ba8ad4f2b877a96 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daae7d49bfb50b47ada9af503ad78dc54aaa3d988b141b2b50261476b7a2f2e8 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7949a860b3d321691557a6809d01682f05ea96a5 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8fcbb0f58549ff26fc0891b0c4aaa5845ba5d1f60b2847706837471d719d3bf +size 199058786 diff --git 
a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf74db54c9a302eb6837d54588088567daf72923 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cd8d171dfd6b3ceb20fdabe00f847bc96b8a9608c50255c65fc40190ba8323d +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78824c9da0a6ad2ca9c05c8d20fc59a0aa9f12c2 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faecc5d111ae88427962adc269152f92405cc2e4894d10aac8bf220de29d9361 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed177cf5f0b223b2e71d1c2e26b1e406c359e54c --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f54f46df1260e308902dc5185f0b7b63ffaf76557a839529a74b1942ec568fff +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f4016574376c87e2783c4851784cc627a4a02f4 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b9d07fb799e13095805a9cf0dfbb3c2af46909d35abc62e38c30b14b1f9cdea +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bdee36ef306e53aee94c2a4701fb8b00fbe867dd --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98a1edbf1e3efa785ea4bc5ea8429667e38a0eb0d8eefb03b7ed926e619439d0 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74bcbf65bcde8d24629f976f909a59639d7469d2 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2eac81b8e49b74568184373b5d1bead43a0172d8441ed51a3240cfcf21317c1 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86a0d2b76588d36d31c5debc5ab4c705c0748c38 --- /dev/null +++ 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:889f16dc38af926623b4841615975b74d72a5746f2719d4e03484b4d31588819 +size 199058722 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa6c9061b046fb728b91ba385e2d5b2df10d9c7a --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8499446f2c95a234c74c6d705ee9d92cacad81483726bf000736788b5eeb2c1e +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..caa5141710f52d202956af5fa827cb723b0552ef --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e74519c037c7cb93cce6f57b233677d2af7371e939ce7dc0e90dd68be318cc5 +size 199058786 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e14cab86feac012192a1a56a1a4dde4a6c2b871d --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b534600067ff74edb33ce65c8353038fee2e25f293c1f2e1f4a87962a397409 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0e1dd3902a01a8989a3014f938c8b5504962d58 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f315271bbb4293c80fbba4fed431f914ca0f9d0616fc8569898f8278465b99a9 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1593a7bf4dc3fbc3b363fb3d5968a65ac825a7b8 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f02bc54a805f55ec2c3dc8dc452a895de4df54ed635c841be69b7d0f6d6b2ee6 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22845000f02e6e4476cc563377a7d266c5ee05cc --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53e29a48ac6771e8b204b268e13681fa46423ae81e75c503d9759e436d86fde0 +size 199058658 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 
b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bfd5d335c2617e2f5439de2bc9bcf74f57410f1 --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70bcdb184722a2b8677e6a1bfb2554c49a6b5ad238e9b579c747c134215ed1ec +size 199058839 diff --git a/4b284b17boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de5c15baf46f5381432b3064d7306779ae4928bf --- /dev/null +++ b/4b284b17boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c1fadd29b3fb669ca729bd445c8415822f4b19c8305a7196d8e35175815374d +size 199058839 diff --git a/4b284b17boscar/global_step80108/layer_01-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6280e347cb792c5f3b1a623dce76106c0b31f00 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:481b9e26697a3f02c22eaa88c07147e7e4bb48d836b241c0eaa66bc36ea6dc21 +size 167511299 diff --git a/4b284b17boscar/global_step80108/layer_01-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_01-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa41d788b592dc0e30b7227037818824c4014c26 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:097a169dbb3ff2a10127f6ca0ea13a7e040854ff39c9a1d4ce555a3af8deb284 +size 167511299 diff --git a/4b284b17boscar/global_step80108/layer_03-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..266e9e43c5236f3f13325f512a6d90df35f2493a --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:337b1a0f5289a9d7f2bca7b911e70a09a3962dea7126d1368ba22b496bfd3705 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_03-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01a7cf4ade29d75a3bb7eebd43957d7ef5784647 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b7fa04e26685869a31468261f814458ef4b43eab1b7f3246203b2a21509aaf +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_04-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a31dd40876724691fd39df4a8d9b98ed6943a1be --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdb3c5ea8125b73dbe44721139477ff5f5f94bb5b51f4fadd92e56d2d9f1043c +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_04-model_01-model_states.pt 
b/4b284b17boscar/global_step80108/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd62d06faf29f068e36c20a40b0e426ce43fb46f --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4fa6c4803060eafc9548fce63a9e783071873c0a688a87db71cd1dc24cb8d0c +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_05-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ca11297ba6a98c92987c6ca9f554256fcde3bdc --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:195ee512e3dc2a858fc5cc616cc348c700a8bc621011791a0d57d88748cb3b1d +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_05-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7540b4dacb113b8788b0a8262398753f047ec399 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b6e947de9726d700015096918543a58e3fd4a3d7bfc67826b13e47482fb53dc +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_06-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22e9b18a5ca268e0e56bd7fb10c2d34d5688976d --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db599f075f33c236d68f5c971fa014c03d6d6ef68d1908b9a8eff3e8085e03e8 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_06-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af6375d64319611b2f65f1004065ba60fc45ad5d --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90aa559eedf1e324291c95f4ff8d3dea49ce09c4c011000308c5ffb6204779ac +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_07-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1042f5eaaa0cc5fb9bb889624b2073c1caab16f5 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:973c9b34fcc383ca50367150cb4832937ef12d9db0c37718fb83b59c220359b4 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_07-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_07-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0d002445bb8113f4ca1f327d8da22320267c6f8 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcc3627746d899551fc2ec93173ab438017aea1fa89ef5e7e05ffe4a9d84f9f6 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_08-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_08-model_00-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..7e97bc84cdbf5a82c1e7cef2a237d37c7d89d8ee --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:923b40e4853408a7e54f0cf1d32737cfc16f099a42e83b360803d05c987ade91 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_08-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_08-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de1b85cbf022021ad3fc9a7504afa285bbb8f814 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b72b972a825476e854a0894a2e0b3dd2e13b0babae77376d8da86227374a79c +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_09-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0db74933361d82087e2de5cb6e5915bc4d9e6e10 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a29d6846225919cd10a37b19c49029a0be37398ea9efac22114a9d46290c74dc +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_09-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffc1a24095592f80ec328aa82bf188a3a4d82088 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fdfc2f50d5c301a8cb486cd9d4c16003edccc0ca8347758d83c1a0066601197 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_10-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8d17c2dbec826cad6a0466e1292ab9a3b2576d2 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be187ea890e308660c50f2631d450ce792994b1af02ab4cc7edd4a4d3d4f1f50 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_10-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18caf9727adcec2f7f94856cec77c034fff505a0 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:490b5307e1459e79b85cd8920a4b0c7457694346d05eec574b887922884f50f1 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_11-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f531b498de5788c6e90a06b015977fa14732d35 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60da7acfe76cad9bb26c836ae039e0ff96c2c66f8b4a08686ed016bc1fedc1aa +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_11-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_11-model_01-model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..a9b0b32c59dea1265d8a1eb32e6c748058eddce6 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:217bf0f86d97b612c903e2cdbb3f8c4f26fcd76a0fca9e46c2bf9bff078b3369 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_12-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e14e5692bcffb52769196caea3e213e3cee818f2 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b37fd1700c3de47e90438780224d70816b3596bd14d71a8328a0154643984b9 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_12-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_12-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3845e8be93d5f07a6be79c4d82298927dc8e6e56 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2684241773e2abf7e0e2fb44f48c25b2131757a31866c604cf50fa36761f4a41 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_13-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bec330cd324b5b0989095629bcf17949b3b76985 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc395241b8d7bd5b52f21414446a1610440bb109ebe444aee5cebcb49ffde6ea +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_13-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_13-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..276996135735cd9bbccf64eae0fbb1fbc8596450 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:178084919d53b4b12f8b91c5bf29ebd43d9b75accbca1520b06ba4addd3bf091 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_14-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab14ab620fb275dbbe8b95c1d1d74f62f938fc76 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b40bc1b4e58bd1e3bd626b584e17fe4364ea75b6bef7144ff30defc59c7921f +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_14-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_14-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11d8b23883a435696aedf2a691528937b3b18c3a --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdd76dc075abc840477a52b87c30f785d2b2bf1aea1e05b9698038b07de5489b +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_15-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2c8f8e28ea38e992e78f88ae75f383624a33819 
--- /dev/null +++ b/4b284b17boscar/global_step80108/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef56566c9357c2f3ec63c14f150ab3e8e5abb9d14a68bbffe4eb29ab83351fd0 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_15-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa6917dc465e335bde53706ac17bf7c6da5a1202 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:170710a09b343a32a98b797fec269ae8521fb84400bd7e376831ef4c3ee8a9fd +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_16-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9029722e88fc4e41f3a8f9add791e74d0ec4fef5 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e286f9426f341bb05af37e8ac7fbc877d0ac513318aefb202162bb1974fa9f9 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_16-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e06fa51e67eb89afc855c15bee447f2c3e769f69 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d5615ec5870d85102bc61dcb99f8d7cba24526af6ba4bf55ab45426e4f4d990 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_17-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46936f3b95bf0c7d48cad01a822627cd57cff879 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91c0760ef2f1a1d351fa3713042c07e4810cb0f1884e12bf9348c694feeca506 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_17-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c2597f3e4e80f71a734825e113bfcc0c517471a --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e29ccaa613594ab9811183a342caa7682c5414aa5388702e4d8d99736d268db +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_18-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4771bc0d8f1435ce3a74de9dd627cd2c186936a2 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a3bd1c9252745b6bcbf47021e8162357296b8e68a55718439d34bc27b626bee +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_18-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_18-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef39e2f7068446e3c13a9fcd876115b309cebc91 --- /dev/null +++ 
b/4b284b17boscar/global_step80108/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be8b3ac07e43f2e1b1f99445e825c0bd1953cd83368e3252b5937f1d30f07a38 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_19-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cf9d4a9255663e552df8771131de88e680b9254 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5c657da337401deab230fdf35458c03ef7c0daac0158b332bba431120e3b8a7 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_19-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c61351346ce80251d2c7ba72342b2a5f4f98736c --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98052929acb2f155036268eebd30e7cea9e6fccdaa18faa85273ca9b6c5ab5ab +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_20-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..684e459428e8888179fd26360a7e2d6f4acaa6ac --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb977f4918f35fdc6cd3b21dc1864309a988c2181463908748610450c79ba800 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_20-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11c26ca211ed07230cfe392cd0e35c52a1bb50d3 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399f96c0a4bd380a2da2047a47852eff4a7e0d3407c74919041accda2c145c1b +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_21-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a73000d922ea17712c5fe6f2f6022d99cf6c656 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8d5343361c1d63a32a1f3c297e37c3191f92f10e35b37b0224e42da2e824625 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_21-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3a08010786ce3f5ae28778a073defb563ba842a --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f1bafe1baa91b66c1ae2b4f71c9c799052a5598b538f1773bd4ffb586c29bf0 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_22-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63b864046ff239a0ef7f40c2f13a873c3367f0c1 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_22-model_00-model_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cadd49a5b9e7fcd04315ca2f7aa2b1d4709df82c7270eac8297ad378c49de6bc +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_22-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02324b0a14e7e0033db9c8a7f4f4fc19de77fcc4 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_22-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e40cab65f06e506f0507ce365ea017b43637ceca798be211dab2e126a4835e3 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_23-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..171e79b0e43e385f77ec5f56736b1efc5c6808da --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9969718814c812523b240a4867e3706982d16b321fe26519d2e1ab31c86712c2 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_23-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa7d6fedd8644331fe0885776d3ea102a1d2b843 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1551fec45171a466ab99b12b8c77bbd64eb016b082f50f25cc37dc0e98533bab +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_24-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c300ee7df956408038026fade0a60669f9f69ce --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_24-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c12129a7ea03a2877fac03232f2f823ac82bf393e11051d5fab06d427948a6a5 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_24-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8056f9ace838e884b8d935f3bea075d098340304 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0148227573ee4ae5ba5c007ce957415c350145d63910645aa402f7b4aa1839f6 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_25-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a81689f55681001d811fc864d405055b92c948d1 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7df5a0df80f1e746726e2135c02fbaebdac526c0ed0bdb9cfa8e6914adb8513 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_25-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62b2a446cedbec6d9777c27094455f61ab489843 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8d5411b4f09675a13c395e939dd29f7ce2cc32279404dc3fff06fe9e0859c107 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_26-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a8c04ef069e73b9d39bb87551d850d836bc8ee3 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f3e04baaf7445b89a91682fabd1d6965a83def6a1e48a9f2545c465360f145c +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_26-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_26-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f9d2ee3681d0bde6ddec7ff1fb625ee8b5ec752 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_26-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a266cc407fb7d82ba645ea7092943e9592f78fd0a8cbcedec480c69ef935707 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_27-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0fea983d26a40cb7c9a65fe5951bed93ee0fe21 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772cb23941fe112f7d8c128a1f9f6073411555ebfb48a496009e1260eab4664a +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_27-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09de90b52d1ca5ce38bdb5efc843dc8b7207a8a4 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0650c9f3b44f9889330353ebca1ca96f4782b660eaad6ecf8bba5d7a6f82f6a5 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_28-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_28-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd1e75938acf889fa572e82bcbcb1ce0e13fa22b --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65db800aab9fc9a6be4fe590ea75e772d4e26cd0cfc933079d8f7c0e2c931602 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_28-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b6a8e3cbc1686626f8a277a0b7059b7bd5e5993 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf6ded178896f229eddcc6035e25a1737ebf6f20f471bfce170a7acd4dd01279 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_29-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_29-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22ca6ddd76c52ee3ed701d57437248b5abd7c902 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:049b1c34bbb7bf475bff8d845048607f2bdde615be0d96555efd920675d6b044 +size 
113308931 diff --git a/4b284b17boscar/global_step80108/layer_29-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1e051d43ce977ea093cdeecda7928f118f0c9f4 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64ad85d022c873183afebeacd521333214de3eb67e49ae70e3feb44b705e8f3 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_30-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..427cdce072c044bf72832dc0d549c8d8bbfd8923 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b296952ee21b4cac0e6e52847c4a2ec5fa736ef6da843b5cd812f046f0dd681d +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_30-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea2f3fac9954428fc776fd954565bc2d1c9dcc1e --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21c567da97c7c27bd89e310336c52850d7047aadfca99e316682d1705c056956 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_31-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c770e54c497960dd6cea3e0eeac8378e5199e422 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51062efa425f9c6d9e40290fc2d35a80f98f127e1b00fee922111797de792bf6 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_31-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_31-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..907d0eb699dc5465aec683cc8989bcae67602ae0 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b100bc96b28025974bda8133902bb78585eab1477bbafb94b279a7125437de65 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_32-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57b2327ade97d10296ac9b77680599772418f7ab --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6a57149af606f95f6768e15f534ffb4d6913f6cfc811337121583189c995e8b +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_32-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df3292c901a8440fb7d72e28ce69fba87330599f --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1605691b17b39dd2b8b49fa6b520321bf1d6a02ecca781d9452260be4f74ecb5 +size 113308931 diff --git 
a/4b284b17boscar/global_step80108/layer_33-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_33-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eabab9a918591673f827610d2d2d6649917a924e --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3528f5cb356c6ef74c15688d89428da6e1745f12bfa92dd798a467e46e81f85d +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_33-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a3bc95b94c22ee6d3ae943099ff557c3a6f37d4 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be34474c069e9e4a51f4ef4e457ade82e1cd1885ec63c0309c3bdfd634d00db1 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_34-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f847b2ec0e75ebcb259c2ec94bc291414276c0b --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c38e9fb8dce1c0c74ac62e17bc14ca742514c026a4a36e07399e6b509294d34a +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_34-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26945c4d5590a4e17b1f13fe3a0e9631de9d254d --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65d6122403a80b61da6e8749605b0fc0bf967cacd78bfd16c64bcc8c59c63b69 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_35-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..619a8642c1f4f55d8b6dcde7a6d9707600b7cdd0 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9da3d84306cea22b42a4deb18a16a7b8554a577013622f590bec443cb63d748d +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_35-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec1f52ed0fc061076df718aca921faa795e719a7 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9c4b5e9550ed04dbb9c473eb60d2464c5bef540c7fd5440434782d79278eb61 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_36-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_36-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..927549aa05a03e4b8bed2873ea267c4a3ffb8c2f --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3a7b960af6d7f8667756531b826087f899b1d3c75d200aaf23018e4a2443cb3 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_36-model_01-model_states.pt 
b/4b284b17boscar/global_step80108/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a5571c7de1c0b70c079ae7f35c611ba925c5fed --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df1df6a4a47455ba56b225c01519bdd1f2af4c78daa80f2f45122be9384101cf +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_37-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f709bac6e2f817c801ce55ebe7e29b0ca65699c --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74140de4ee8a48ffe996812aa604461424648bffe68895b2a71d89260f008c0c +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_37-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01fa7fd7d7a5eafa587cf3db4c08b5f6b721f4ca --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3692e6fe47ab03d896742ef20fe0be4a7db01d4fde040a34f7e7f511eb7709e +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_38-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..711572df58fdb4db12f9add0f46710b10696788b --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e8edcbb550d35f5e55b57d29e257bd9d8bd1c1df26298d7fd257a9cfc199b10 +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_38-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73dfef75868c7679a2db82162fbb80247a78aa80 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:027772be95600fc8d552a4c2c35ef392946499c515e0e69c23429c7095eb02ec +size 113308931 diff --git a/4b284b17boscar/global_step80108/layer_40-model_00-model_states.pt b/4b284b17boscar/global_step80108/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..733a832bd8b462c238e482d9fff859b15cb902d0 --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deb9bfdb09055154661251b3c2e06f84d278c861e4b38c4e7998a0b4c5418a70 +size 13507 diff --git a/4b284b17boscar/global_step80108/layer_40-model_01-model_states.pt b/4b284b17boscar/global_step80108/layer_40-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88f5244157f4bd920e15fdbec63db08dcc32cc2f --- /dev/null +++ b/4b284b17boscar/global_step80108/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0e96ef46b3b3638f54b409c114012fceb8ef82b083cc59214a1d0d23c4397c6 +size 13507 diff --git a/4b284b17boscar/global_step80108/mp_rank_00_model_states.pt b/4b284b17boscar/global_step80108/mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..ab3f3ed27b909aaba31122a2cad7f165c8b4d2ba --- /dev/null +++ b/4b284b17boscar/global_step80108/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adf39491bb8c331888deb5cb6afd252d1a2252b7c13439d9167f0df05e04fabe +size 51635 diff --git a/4b284b17boscar/global_step80108/mp_rank_01_model_states.pt b/4b284b17boscar/global_step80108/mp_rank_01_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf2eaa57f00ee1103680878c62a42f7c8b02b188 --- /dev/null +++ b/4b284b17boscar/global_step80108/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5641a638ce4c328513b96f7b7d2ca0cc8e219ee4d79de868dadd3987ab8ac957 +size 51635 diff --git a/4b284b17boscar/transformers/config.json b/4b284b17boscar/transformers/config.json new file mode 100644 index 0000000000000000000000000000000000000000..df814c3a71ee49044410636b5eeb4f43611690ba --- /dev/null +++ b/4b284b17boscar/transformers/config.json @@ -0,0 +1 @@ +{"vocab_size": 50432, "n_positions": 2048, "n_embd": 3072, "n_layer": 36, "n_head": 24, "n_inner": 12288, "activation_function": "gelu", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "chunk_size_feed_forward": 0, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": ["GPT2LMHeadModel"], "finetuning_task": null, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": null, "problem_type": null, "_name_or_path": "", "transformers_version": "4.25.0.dev0", "n_ctx": 1024, "gradient_checkpointing": false, "model_type": "gpt2"} \ No newline at end of file diff --git a/4b284b17boscar/transformers/pytorch_model.bin b/4b284b17boscar/transformers/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..60ba2a3bc61b33f25693df58fc7f7d5df9c1c95b --- /dev/null +++ b/4b284b17boscar/transformers/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99777a5c1cecba126583c43eb2ff78aaa7004484638a57e581feb845aef97593 +size 
8781203669 diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..085bb8d1616a27a92f652c3effe410f6b6029ede --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.3119407628991087, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02839548709754242}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07438946576288918, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025513647065143596}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.2632468762241862, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004868249221352049}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.0990806449013871, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021415843483912466}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03312917849091313, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014174063689487894}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.12235027642168965, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0031144296943736486}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.04529204402030828, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001254298305442429}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07117230529834999, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00236804160887153}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.25570352079335334, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004721463101193061}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.09534482558563832, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019930933840825046}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07060394534578683, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023788303493259178}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.25086964131360756, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00462453643406655}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.09431820077676166, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020087380246042903}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d8e35df946b678e595656290c86b87adcd7b5126 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5375007952609668, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.027971198631201743}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.1520773825012676, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004748290705934928}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3227263264666382, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004944101731638509}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.16868686014694956, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037497452245084024}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.07832284618044694, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0031613171494569007}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1641704139450893, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003490737203218088}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.08499563989738917, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002483035814872989}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.13745282575305529, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004251980584200159}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3019875028781424, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004563649108443115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1534335691189539, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0032520482964081736}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.13987460558088444, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00433835517591582}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3044600652888652, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004600132970650535}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.15571056260264077, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0033278332133086664}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..26ea1d0ce395d37ffddf478ac203b679c4dfa142 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.7869498474646944, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04640396849840582}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.19796396527381735, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005701278438899864}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3706483303216963, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0050173611899764405}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.2084575980044784, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004415820801430582}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.10467370587595308, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003819065495086435}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.19711908309071768, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0038874252766392605}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.10954670655177427, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0031308589770994762}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.17592108046935254, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005006735502669654}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3446684722385714, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004645081391462266}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.18766755781965408, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038435725948238425}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.18060870457183115, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005158605822048483}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3494009114809856, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004713912149914969}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.19187903453194924, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003958689064769724}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0016d455659aa54a3115c21df15505c8ca40d674 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.8781645295094662, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.023295523644902574}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.20253583048997179, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005667155356549766}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.39458836927395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00508328824891432}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.22142348005369172, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004576657734814378}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.11025140845494157, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003876483056835859}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.21020652499163808, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003888206554440605}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.11757388147146428, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003185839232819997}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.17821674790607994, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004906399590419944}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3645388282549846, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004646105275227094}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.19753768298479737, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003907531687545351}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.18364695968481887, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005076299243812921}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3699350432753926, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004721581387764664}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.20241133896176328, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004042906289046454}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cc88910de579205d8abafda2554bb31cd658b362 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.9900707197163517, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.031011845880251587}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.21624013657641925, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0057958787897655845}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.4006689051971167, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005025377089358478}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.231391578826814, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004602731955828627}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.11741133357899738, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0038707531452077852}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.21636504595775147, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003993049382207552}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.12321787830184827, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0031814702327693906}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.1866671922061064, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004886171018823189}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3671506160865042, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004552611209109354}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.20337113659463418, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003830528444749543}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.19479702587195674, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00516733338200102}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3738824304063063, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004627178059919308}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.2102687170624152, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0040377255274080814}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..856fc366977b134524aa63998d6722d2f8fbcee6 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 1.1661469259713872, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05442011585382653}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.23542620246457463, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006120612746647669}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.4126382803733479, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004969059062845303}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.2462703929154201, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004784081954132341}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.13414542688077327, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004317950948756051}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.22644337464507955, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00400180902077526}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.13531236031865498, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003419155242253958}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.20531156621874325, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005277463604539717}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3773305254767969, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004505345415815039}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.21699579683860706, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004030577049307887}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.21360232440350232, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005527478664092985}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.38494058675880927, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004590859610063705}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.22427477226716616, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004231158968354307}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7d9ffae5a1fbec17d5e1561c531263c10b3e3cc1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.041346373718141455, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010599623460339127}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.25211310702802325, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002788455474554402}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.06377388411846296, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010349413916477116}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.002839485845793308, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00025451562903735196}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.020610793870894942, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015955007557906615}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.004534425826606448, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003879492990087019}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.03934308508221285, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008489690182718232}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.2494389106117795, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027727843463422105}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.06183696371033167, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009056912304361808}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.032285247594853815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009220039700099138}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.19883286423228544, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002350462700574554}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.049196335508131475, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008582052072391212}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 0.01632350783277848, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.002358717732716759}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c26408a620a2b91c4cf0f905d865897897c89a55 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.44445132729923204, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006526455541829097}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.3889450416416867, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005166406539461182}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.35911029870880495, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004787852480889677}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.21846220594720275, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005005245010753829}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.18846751093405656, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004061844071462999}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.17232524259944224, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0037325291663246048}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.36432134727711496, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005635577543148762}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.3229414171436671, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004531449966392583}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.2930536919182777, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004002123970091481}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.3866016578521912, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0058725916797575575}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.3373116717187921, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004582899451391632}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.3097862182227815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00415316763055169}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 5.568248943628284, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15936131698908332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6840ca970c89b49f0d05315686c089c4b74c252a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.5921712622610545, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006080920495444881}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5256791287055436, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004787960765756497}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.501627380148766, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004626333035685351}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.344221483181946, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005178410791483312}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.30078742559239424, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004299159471007747}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2868389890412573, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004120089080937723}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.4827491624545944, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005485212821791447}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.43467949164870573, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004513595168941902}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.40933775812934475, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004227695419911034}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5175873252849847, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005679308606153899}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4594793391268419, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0045209528301636695}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.43681937401317794, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0042947259171227025}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 9.327080848315214, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.33074776428543545}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..aed4ee519df405595f13562b585b1009f1817776 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6196402629179393, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005522213166681634}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5364136875813985, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004681515049484617}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5296729046946752, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004241547746449502}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3662989314092075, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00496675265113463}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.31326806717833167, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004336125727613282}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.30821667509197176, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0040240686439643155}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5062917357965477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00508050519245006}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.44263683330106596, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004460129545645125}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.4331671556081239, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003993476175551676}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5432394466733573, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0052393692520715275}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.47055228809438604, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004443920194380233}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4628894235546133, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004002328864605658}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 12.36334348883012, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.5943366673610281}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a6ed7e11be7f68e382d19a0bba5f8f0710081a9d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6365786298113485, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005343609526718168}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.535103469511539, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004748804702692128}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5402829312776871, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004197339364184242}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.38031962461269936, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004916807569648106}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.3171258246836024, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004430877750976238}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.31819759490776656, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00401669725008511}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5204631665257551, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005057831046431135}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4411323450746604, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004524079062455917}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.4417213853690807, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004018127749079752}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.557753905383597, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005156190226040093}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4677559934260435, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0045047882754511446}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.471525411066689, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003997505234098393}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 13.83666464772769, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.5924556563703853}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3edc82432442763fce0190339a06a7046eccbaa1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6510118389549906, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005033868263865303}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5398764737436825, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004749394572676228}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5504705447467583, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003931470606428917}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3895468651617418, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004836511207740123}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.3218179502144929, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004358671474235282}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.32523157643827644, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003905427714337188}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5320306794072402, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004844722059279852}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4446621595631789, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0045022387221370155}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.44940514442893853, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003799590605940088}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5716550921263407, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004948958293419547}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4718974866236728, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004467252768544588}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4802802956854494, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003755409047032572}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 16.027089803530085, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.4299479515875188}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c0427b4fce84e928677f46e20f98e4ce033af08f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.497657873618323, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.012349325099936713}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.04224352625871548, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010010170277480635}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.2907403847735972, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004512637085950553}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.07015638110137974, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015061107866413254}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.011843871110726717, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004944309477955418}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.07756192597479347, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003065548774425208}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.019528844760768004, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007823549661191675}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.03916091211293898, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008415465548236258}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.2777346312680996, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00411812624577053}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.06537354397270882, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012792117359157506}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.03409147439298827, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008881130271259737}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.23208716933748721, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004219597703853115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.05643295459659358, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013478369359460178}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..52b1eb615149592bc6f3d2bd60cf1bbbdc35e307 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 7.427390130114957, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.43368679373129737}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.5191085718453924, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0062926152585314095}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4305337819646001, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005069927095860342}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.4118960520453098, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004628613219555022}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.27355769670764424, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00502844636364607}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.22569201243104556, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004180857852458944}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.213133329600957, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0038153064397342023}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.42309183690103064, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005570286910843408}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.3541307751631573, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00452062225618094}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.33429534379484965, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004021868671783561}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.4514869427389182, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005793363430064205}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.37291654050475104, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004610348051058544}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.3549292721389946, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004123801774630897}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..08052ae842dbac1145a43858220ab75702e28d22 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 11.690802198271522, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.31962481261541265}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.60938885297845, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005736762572015516}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4910912194202063, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004863683461648982}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.4931339577050635, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004313965951232142}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.3538438266464095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005179933434688372}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.279763244723918, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004204372995712399}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.2805748499547808, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003992048922184352}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5022619497278473, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005313489957991849}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.40736431997199174, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004488232749862428}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.40541177161378195, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003964253611695142}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5338817231626088, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005446991150797629}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.42899667780520745, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004529150486034353}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.42949056532886654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004009506107488504}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0539e57c0d9e56e2c36a510b129f620d181bd00b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 14.015729199684401, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2861398682078767}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6338330153613686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005504406796192468}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5032857728045119, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0048083765697394315}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.5162716716149718, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004182805028604736}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.3755638462897083, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005104018049016375}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.29412280640119604, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00434667917909}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.30061442097006785, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004053287242140008}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5245847967636375, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005180633927285279}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.41888362437737175, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004531219879037914}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.42681373245384246, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003970750290746653}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5585561781062417, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005301431671802071}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.44153744457573163, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004518919451629006}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.4522158954741629, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003949519994979285}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d978868d8ceba8d71146bff9055ca7476ed6cc8a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 15.057505679492566, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19539023179986367}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6455851098568031, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005274725748277269}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5097818550726657, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004854689261667029}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.5273437410664512, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004079973205520906}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.384680042250048, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005024147317886237}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.30024070682614185, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004425376116803175}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.30881180787788937, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004024262874364813}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5326343859410355, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005055378455270224}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4236581068981246, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004613640724561802}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.434864604939784, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003935277234180222}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5687274941110855, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005165852198122328}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4465950275423008, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004587981631618588}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.4615325857376345, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00390879554622888}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7774ff40833c4fde7b6eb887ce1c7c404e4ff03c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 14.913912914173546, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.339827970669822}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6523868918453206, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005062625589297553}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5085753634275134, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004805104401460468}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.5325855516384019, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004001597007402866}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.39050183995250265, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00492489283628371}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.30262983728520415, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004346922166045064}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.31441853558362365, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003975943025845374}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5386923972401838, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0048732094440267345}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.42372630591396704, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004547327839305676}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.44000229896478804, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003849076820560861}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5754652293985637, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004984504054500151}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4472244706278206, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004550869580659433}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.4675136302012598, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038498013228786825}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c2aea565141d31a2338fa4976edb65d36442fc9a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.0314881766646196, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015126365359450058}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.16236274800989733, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004098228994255239}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.045274028182706996, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001450220774744457}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.006796590729668326, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004941842666750881}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.04018995353352481, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002384955095113897}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.01098605507786806, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007511759228558155}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.029272367859116636, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014275614818918265}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.1558380415065589, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003946147985654718}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.042154554721966754, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013001281158823512}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.02834368778173644, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014346203472812527}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.14391153007861998, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003595338406210201}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.04020710158810906, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012878319091503328}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 0.1125579135927861, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.013714020504806836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..685bbf8976c223790a8930370c14f8e73a34ecfb --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.4453173150800215, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005867312457939903}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.43803748852474483, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005340184338954727}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.38692686155050754, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004548113492644709}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.21705422098748592, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004510213787992247}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2171325390790844, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004282000275127611}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.18729231846618402, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0036584106265992998}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.3651358482383703, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005118474245656637}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.3621730834168619, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004648257519661906}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.315850989957773, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003848944382861772}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.38828248191877074, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00533391149324851}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.3815163353147994, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004780745611029048}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.3352406893630061, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0040092153733560084}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 5.608709262619956, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2845344912673943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d02086c7f8de0890d550f778f65d6d9f479c26b7 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.594075097469876, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005735614328570698}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.5189486365321683, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004964847282332437}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.5005214985653731, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004384268530846718}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.33511234833300463, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004901792389818218}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2910349230767627, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0043654656293511465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.27862716831146983, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003948265404921788}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.48216021689966443, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005175532998377631}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4271540769347934, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004639021202183366}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.40637383732751037, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003976359495955301}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5163251014695897, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005331549511342829}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.450327008993731, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004620001309284672}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.4328365405216621, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004021762465773053}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 10.191385722066029, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.5616979090322316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bad436a84d45cf25ba4a219ef56198363089b149 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6422175208003964, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005158556091166668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.5220235781715165, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0048808827264682445}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.5327228493532781, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0040829606282401934}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.3710515262643343, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0048873929905967495}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.301460611554934, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004401012474144791}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.30446134771742095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004015625700663121}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5224004508992472, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004880222498755768}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4277850121139605, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004564467034606729}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.43314386481790423, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038756825295561946}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5597535897029906, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004958294292183076}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.4537322290003083, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004551965114749363}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.46163771703476303, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038162152037734846}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 14.527790269592327, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.4446902271294265}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..43879cc1cdc6f3dd162bb69b126df714523fb28d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6575431547631965, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0049491040890518685}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.525349510502271, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004884464622105695}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.5434372041612334, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003978622662539026}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.3840813340290783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004787008650855236}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.30651040101987426, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004431179075251621}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.31365258819037656, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003924794819481102}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5336789314225935, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00480848453447696}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4285556303170423, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0045947492289961775}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.43987603799527164, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038197961382302934}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5711707553995303, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004829283144054192}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.4549215035973506, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0045583348223674925}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.4695951533979971, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037684239929364916}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 14.718574394473302, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2197487642536066}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b89ad7a744fc23f8fb40e70c33d22456af101dae --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6733576969554074, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004783096353879735}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.5251265497060836, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0049951468314383225}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.549668413334997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038761839348562285}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.3989223056863438, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004801014921009065}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3096200378794593, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004477481811131}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.3207540410972429, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003903119960695521}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5476435195138217, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004678577775970766}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4306118288590538, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004703335999269713}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.44712303244722945, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037795088969010803}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.586053500249237, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004722958800399445}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.456069027426347, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00467456255925281}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.47620688306939846, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036948499162037154}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 14.665585017062076, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.24236429155044314}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eb70712c5c576f22626015ff11a2cc38c22efe09 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.06843022196095826, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016209829348703792}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.3693465742479335, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00572904770734739}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.10846874891657471, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). 
\n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022660858526442344}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.021934695212110132, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008587513806860508}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.12551180235367884, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003857834886276161}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.03520534486773339, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012605430557977252}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.0563802824192856, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012612650769465242}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.3137283634535684, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0047883163685496725}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.08984155494834113, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017851961263540854}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.0599262486131966, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014112607677852654}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.3288265886334672, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.005158374848086975}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.09510390510913437, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00197951518364349}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 0.5771668227229001, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03949018213229336}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e49af4c0a289fd4defb7197e346a63db420c0212 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.44750599283357245, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0061750153120256656}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.5030371140620634, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005057579814946662}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.40672974088724956, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004747719503261412}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.23121201106352332, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004554395505359171}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.26095223293350595, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004203049627414673}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.20794763136088765, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003717449517818062}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.36379932365992146, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005316173565881487}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4154788393144536, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004521009665809461}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.32968419760104106, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004005395669312728}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.3878588296056201, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005564041350161236}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4364619953127603, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004639975004724414}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.350222658819573, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00419318437885468}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 4.943613794712205, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11844944764051875}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..41b2c64f1562344b84f4f3cf5ccb017a24879ad1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.5520188411993677, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00601214288474237}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.548220949758813, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004819342758931737}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.49290093244985994, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004535226637927198}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.3120875667889668, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004983873820968777}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.3067736586377568, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004384823596918038}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.2745871334738363, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004011289790613376}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.44944454565402747, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005387909108481333}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4525511017399319, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0045363314626869445}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.4014212538061013, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004110491966627668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.4811824027219158, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005594588861653996}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.47956221956314604, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0045847302918650785}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.4284958019884404, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004213919043895169}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 8.417964438312604, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2643792221022832}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3830119360e7d2ce3feb80271adf98bd7705cd61 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.5928085610957852, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005517450633254952}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.5525764549057455, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004701547158523795}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.5252035299000158, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0042492415448978004}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.3371412147682034, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00476635918343834}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.31720395729691514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004415324262802716}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.29693925227774526, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003968509948406998}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.474222482083209, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005000455875766691}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4498982913693952, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004517684638474564}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.4214251230467327, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003946378140362295}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5159206623402781, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005204009404461667}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4820510087427771, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004506991367679515}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.45579454499026173, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0039966392224786975}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 11.337441125938529, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.35281351837475067}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1232ea148dfb121c78726ffc84ae0df0b0e8f6be --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.6179161705510887, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005179925027460375}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.5499793848410689, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004719898662325513}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.5392200875202074, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004029829765978907}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.35489114840169056, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004672719377311766}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.31710389747209167, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0043486372666120005}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.30727811276404843, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0038965857969858637}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.4928834334072748, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004801941083552381}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.44439662825960463, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004493778056439419}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.4305982476427426, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038132986252883195}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5355960987340154, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004945683254307556}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.47715356656187047, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004460972059827477}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.46597541865449993, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038235033593005787}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 13.517168068549186, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.46447484153448027}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9da4b26fbaf3a45af36ade2318644107531232ca --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.6306450054131605, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00501971293696801}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.5530712542241596, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004764483537044356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.547936351735332, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0039340919320462716}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.3660341167515341, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004644221137903947}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.32179748545535564, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004381948513132989}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.3143304096356874, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003788714030250737}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.5058246261062164, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004683812001690324}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4489733936754188, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004527883655130444}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.4395476708412179, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037047300276620886}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5476387430812724, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0048323373547494745}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4809209755093611, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004512527660709306}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.4743263285062845, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037242713323822315}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 14.620474736449363, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.7629902022298345}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..af6cf795f104d88e1692befbf066954c73bf81e3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.10652121167086522, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023688690576375666}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.18229124220903334, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036055900199667965}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.12325351760686601, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024378361146446775}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.02403122038179163, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007699060973631696}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.043625038488324976, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001479846592084219}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.028234093188988292, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008538683596415171}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.07502743287846425, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016790640765481034}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.13332814861106884, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027047012209831054}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.08741178424542367, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016840514722071915}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.09866958650856938, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002213777292688482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.16898386767188542, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003353535963349856}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.11405814566073934, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022600674454096924}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 1.8920645866194166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10906475215438381}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..704c74e71aff2df2b4534fd2456a4327b1a62163 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.20625362062713593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002872702399736292}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.2783423182358244, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003090804568475943}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.2065151225097691, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00213658012540372}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.05273686788331137, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015443615818967447}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06918059640276301, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00164036782101426}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.05042504357023753, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge2_fmeasure_stderr": 0.0010841844819624794}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.1506871830070282, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002247453578198224}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.20472914353418092, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023795235115801183}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1494723652733573, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015152057849824208}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.19197325780481556, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002708881855569751}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2588957282283766, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028837734118927188}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.19187799581291262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019884841459792014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.6453615405491733, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08715117333663348}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a54841fe9d23b1eb797cbaa8e7b83b2c39e2dbd9 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.2604985230342208, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003532579292360877}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.27042057106417866, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029893664849675794}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.2215153917570656, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021128026916489443}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.07505187378193366, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002143898427448883}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.071229761133421, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016357845818527237}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.05844382108884806, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012190765682804957}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.19802942305185472, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029629197765882512}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.20201919148745845, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002320204372933796}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1647093031611208, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015810334646662381}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.24329711860986517, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033460280224748254}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.25211394848515684, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027986441657024356}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.20635319833287152, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001971460412273462}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 3.443834214575318, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07662644696644708}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_3.json 
b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ac95f76ec047de21f1b27ed6a35d7036cf19a056 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.24793362733863641, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004072705116113129}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.21602357804646766, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003247971160753683}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.18909911450688263, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002485261851347391}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.0757189416385469, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002438351512209196}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.05889179745482193, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015809058259419952}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.052264080690175074, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001305053336431743}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.19350920887510484, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034516414937821207}, {"task_name": "GEM/wiki_lingua_en", 
"prompt_name": "article_summary_en", "rougeL_recall": 0.16335220863904687, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002527136592321546}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.14342019954697277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019335669853057015}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.23336686434028253, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0039000769946596603}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.20188401739948075, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003045495397045043}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.17698278189160158, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023368019527441448}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.9320240355527942, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06400188830946264}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..71b2d529cff26cb38f0de7e1d041c30edde75fee --- /dev/null +++ 
b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.08505455985583703, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032686912222570355}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.06784754434173991, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002609991683396609}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.06018981225571402, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021308767674926973}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.02558603416085643, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016206620665113888}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.018640033822395298, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010708114501748317}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.016389358943220907, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008517841588473893}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.06804946746366528, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027433941383577104}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.0519878061323653, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002012096563564017}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.046457274721451496, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001657068676571334}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.07991322250629485, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003104723745220196}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.0627062428611556, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024026682988274907}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.056048626197687586, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001990429951365515}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.10422757463369249, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.016913708236514044}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cf09605fc902919706c985c574f9ef2df57e0fa5 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.014899886343061936, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001636426713859285}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.010011528241871159, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001061768890010487}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.009433687410752581, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000950731356497763}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.004817270390027373, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008233657770553777}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.002766503050071671, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003904045601421101}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.002816474216950762, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004045724170735749}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.012510249874248657, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014618089207318412}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.007744934006324679, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008222562130519892}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": 
"article_summary_en", "rougeL_fmeasure": 0.007435953857664836, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007624386481813805}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.014274933245445069, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015867990609173925}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.009433722001459076, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001003734017369036}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.008924051993279348, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009024291145168211}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.1661355655826006e-13, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.450117888401184e-12}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1b94b558048d39a53a5f5fc44682c2548b096638 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.07027100884274905, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014521445356974571}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.10472445052112465, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0020798618023743585}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.07708183394727501, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001463347467260908}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.008562832254231845, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004136142831151558}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.013863303125330673, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000729102085052192}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.009678964571703324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00045902080520941383}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.06026932981633865, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011386395293241382}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.09167546245478224, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0017633863007209104}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.06658903938989344, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011726928962648847}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.0657271406620227, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013418331595434927}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.09851222709720396, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001953574801462124}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.07222873567668915, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001359766582637702}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.505726128252773, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0470776753818845}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..97518a56fb5f72ba40a804027d330344124bf328 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.11632215837726943, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022719726698542486}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.1089073980683278, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0018643903712033862}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.09649976388611473, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014734065656729948}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.011877583800252254, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009921470208561732}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.009499474155946874, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00063547866730146}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.00821435177714014, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004985188271972035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.09352683015219185, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019241784332484945}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.08657411410201665, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001449213254847429}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.07633654428148366, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011106535993015147}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.11156528958872643, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002174780318855094}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.10428854893025001, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017563264576441672}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.09246968921921218, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013963398060902452}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.7067381410576549, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.050068101131539915}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..faad670010cc88fcfc6926d83500000502602955 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.16600399585233996, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003955264411797659}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.13361835608132602, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002823518494999984}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.11656166289615656, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022320874454571445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.04755537268642802, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0023999815353559694}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.03077587271042821, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011932156140592707}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.027031284270892082, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009748921310219831}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.1375576289609542, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034657367963040514}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.10787166436393111, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002274225028197049}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.09362676385446607, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017594678125178764}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.15740400069202692, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0038134600407906997}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.1251306649415424, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026365736489900736}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.10952081998972399, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002101849995710418}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.556112040416221, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08795854619057293}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..15e88eceeaaac88a5680600bc2aa66102c40ad27 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.19744399731543294, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00445562488490846}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.13957366973269725, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029942900998473883}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.12759653021086045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002413682977750019}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.05894645060462109, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025387661262517916}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.03675690679659171, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013469211911312032}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.03325713135121395, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001088176891614555}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.16313034967971393, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0038938090385992356}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.11198595815037796, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024095154984473616}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.10204884478099904, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019050000519439114}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.18602083735680533, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004280440688070132}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.12966848343967344, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027716800385877054}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.1189102904059155, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002250535740232956}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.7264483443297305, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05497724685084595}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b51f70345fa1c55993a4ee5a4e550633e4cb35af --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.0759239139617649, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033259656106442337}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.04900609435825128, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00217479262759098}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.045804729083272995, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018398695267236884}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.02337045099123398, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018175381983276736}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.01330250450032572, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0009108445730813351}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.01233554242069137, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007539785982029479}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.06433885009869143, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002943755350067257}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.039872298371084615, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001773782906076355}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.03730234130304816, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001486354806413072}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.07182753236028966, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031894464950767594}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.04564084123760375, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00201563203869237}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.04272052087689551, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001706024011899182}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.03631236847231814, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.005558768912443654}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..db3afda1c517969c33d8535b27c5305416782139 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.012958829476217265, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015012434227656753}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.008235611123812514, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.000978389330203237}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.00781487832891677, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008441904204964693}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.004843771947683436, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009481338099913783}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.00236491801694592, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00045623828249607127}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.002188555672921467, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003407015183673761}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.010904268951568518, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013380932808541883}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.006477885900035703, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00078133822629954}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.006226776578720049, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006793362081572629}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.012233303392934904, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014413364413859053}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.007708053571943523, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009228587153200308}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.007267764694797319, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007815243584570973}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 4.269762783929057e-16, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.518338645971512e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f451025de856d08d6d8fe1c64c90fee4f4ce9be0 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.07075481728468239, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001896345738729238}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.06643817307325445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0017106553038965962}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.06017877876012817, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0013969700713652288}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.008567992055791158, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004658658723066838}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.008438860890352569, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005506053125992731}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.0073466534048262444, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", 
"dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003938081639864923}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.0616207631889302, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016585682800579703}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.05781173371355003, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014446828709987923}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.052169980061393784, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00115977640796648}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.06858606377672731, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018515695822560211}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.06374381567280066, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0016154876653642737}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.058015717752569224, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013371683684572618}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.30166616125136464, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 
0.03299092473107825}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4090a473262bdb1cbd6f45febed3650bdb03ee20 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.1371808934348306, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002439319230759517}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.12109561626523216, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001793187416049064}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.10986107432965571, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014492605599785123}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.014358452712872411, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009985415125916485}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.010448529454438157, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006381428033098405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.009793539257262535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", 
"prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005566732491126781}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.1105937798064453, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020150560406063285}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.09669600580268385, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0013796211767045082}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.0873337413737127, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010847480607585381}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.13197826678166255, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023452181446837026}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.11668487016999766, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017061887616042837}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.10574341756058646, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013795771170144668}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.7295859274519257, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05390866707359243}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e65f03a8c0e7fe1de61ff54891f639e4659462fe --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.22790730662467748, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0038619293949543852}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.17039417678072985, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002623682838317647}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.15896419199407144, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002110741679844575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.05835411226724025, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0023625109773989625}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.03655654766207606, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001311888413153822}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.03477199950268131, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in 
English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001148170515656098}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.18500655628018325, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003313643303697083}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.13521687887956313, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020736922184739505}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.12588142669948893, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001640871513930331}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.2169696601785429, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0037256328539598173}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.16119268884778876, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00246649624417531}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.15048882231569044, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019906565024388205}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.1889717960368373, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06741726194279306}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..96c9d91671cedf661eec262e2f9290f7b453634b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.22026933435692986, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004226027941921998}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.15168635082635815, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028519053998314037}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.1445324809151167, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023513187047355505}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.06276445362278234, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024682215594049096}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.037991873791024235, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001399765447448907}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.03606843530067028, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in 
English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001164984602255506}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.17924178959744783, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0035990136677962322}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.12047726091128781, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002275634505946444}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.1147650004577283, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018510908050423905}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.20873376387414452, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004046132323079285}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.14298638872506758, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026905116077442096}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.13635671547440492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022228611437271725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 1.8679229391280179, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10987316164693478}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..de3377471ef40c0b2cbff3334044028cad7cda2c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.07393725587462402, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032181295487344647}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.046454406894951346, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002038618034193849}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.044918688464249894, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017947926321333124}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.020800186309775436, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015975541170520002}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.011520777401595593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008755975458470784}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.011254692924376551, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text 
above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007414468996376601}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.06182358864059859, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028062754361036935}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.037078468066000264, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016222637598800273}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.03622514020157269, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014518847052011796}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.06969123514917439, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003062187931063394}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.043474473389648785, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001906693067415681}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.04201633885081397, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016772951164631064}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.023688527977254865, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0035743692989931774}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7b681c34f8a74b282b1d7d9ed9e822c632d7be7b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.010706295526971374, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013089961975418759}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.006817958817895227, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008411163469920986}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.006735634553796344, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007809237197212106}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0026189086956420245, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005673572995320269}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0016413611270950422, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003194112456571687}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.001543687945490394, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text 
above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00028832955644989586}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.008831988452309125, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001131402041475174}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.005517158220882632, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0006948992894017861}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.005386364695009258, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006350514269837313}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.010075169657915205, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012404555583104564}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.006456627301202316, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008077762212667873}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.0063447932832890455, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007407891319077604}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 1.2748239955674293e-18, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.9738057634526036e-17}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b183ebf5d55a33163f85c435f1894ac06a81fe8e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.13048152009975786, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029945327745025297}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.17865577403787464, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033881835064667153}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.13282629414914784, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025194987202168846}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.03018125254771904, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009701597241428515}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.044665240253023694, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001413086106707342}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.033116246346807675, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010039942081565677}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.10254296237637063, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": 
"en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0024976516791238405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1412543271300389, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026741023650489657}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.10296911367521405, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018747375341668269}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.12215023147917263, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0028728571931844558}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.1662465685102492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031683211934688273}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.12355432651994205, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023550817743274908}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.744610721855112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16526552066121028}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5c8edeae955fdc3aec0edff792ee5cb78605857f --- /dev/null +++ 
b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.24565174971014234, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0037427166308021094}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.21184490359364247, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027978242331130445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.18959676949532062, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022208786444120037}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.064292697870659, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00223313142118904}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05044989353111514, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00149356139860852}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04555610212149096, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012842639383539272}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.19090785019208276, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031096615547551243}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.16233189417978816, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021666821757260154}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", 
"rougeL_fmeasure": 0.14459077408940324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016809419269475205}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.23112890188861118, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003571687360942092}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.19926382429471512, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026476455852546383}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.1779721097148494, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002089075825231665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.8487850954748324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07399043126258031}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..22ae6419212765e608368342e510f2441ab46678 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.3177459530757474, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00400211201809437}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.23523949924863127, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027923501651691642}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.2255393163759458, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002203135213246267}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.09785647917944716, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002624157152992091}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06533504155625053, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00156036864094978}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06387789794595161, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013813734339407}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.24936441373065246, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034018286291394332}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.18082782092908864, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002197669620317046}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.17360298467727694, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017280099368334454}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.30084337998110033, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.0038595850362637936}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.22147712923465016, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026291853666447025}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.21263985026820842, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020905527432427407}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.7186948773008828, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05546113136172655}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4170107303cfcf5843b411bd0583ca54a8c338f2 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2846746242566653, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004319925738331281}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.1938876931146627, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030463601923653946}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.19252198096891418, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025594189650328643}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.086213919205008, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025532621303566427}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05487592547354139, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015509508090563054}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.054477366161976716, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013706432857531726}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.22546828575157715, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0036375567676432637}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.15015796002510307, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00240340751959009}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1494133193527796, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020035425127221443}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2694034648043568, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004158098981958554}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.1821584435675664, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028693565561164385}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18112869368220622, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", 
"prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002417009296022553}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.687306446962641, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0664578324203411}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7f5021fe102fa2e320852f26c7d1848f611108b7 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.0960156493415235, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003616614563173548}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.06015169027512941, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023503642149532013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06084731695924829, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002176933000207993}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.02926487891196175, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018445908136531816}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.016390097756710017, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010106857547182186}, 
{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01686088972765567, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009404003204068355}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07801821469361046, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00303848388489472}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.04729130821614451, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018488779124782556}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.048152202763597045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017262084858342547}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.09073940139372572, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034537583670396933}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.05607109134810165, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0021842218596914128}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05696906006963692, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020412224032817106}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.04668959419048288, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.007153039873759097}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9ff4e3d33e66b6aa69892cf5cfa6ec924ba822e7 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.016572326495735878, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017108845418978154}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.009927306068576757, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0010319824087958868}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.01017010786697453, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009828813757946661}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.005362446572675429, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008727663926039747}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.002846016962133478, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004240233213271509}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0029391304431817997, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004163042405148408}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.014042125577094597, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", 
"dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015259273674767392}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.008096887154439805, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000857985946484465}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.008341512107444696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000825303231689821}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.015986982643131028, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016701967608221248}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.009428039452194052, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009838146973353556}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.009670630558683124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009355088060122658}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.2603158162943564e-14, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.751733488101547e-13}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..f7dcf1d64fbfb32a8d4a6e08f60872b31b83a371 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.08141174878886664, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001439622775683003}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.11192550497068104, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0019100391613226038}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.08553702561971455, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001355291261214151}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.007141249340697971, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003903230357924652}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.011016516206080998, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007193905687669391}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.007670667921800801, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004129088476854741}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.07117531662005762, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011655127025012122}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.09968282512781339, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016088854064618629}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.07528888534904114, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010842064028811886}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.0766197709818064, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013526915170476849}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.10547363912496262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017901245298913224}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.08045200906728588, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001263370693113323}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.45919147551421197, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04515627730002599}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2b160627ed118ea855fda5422345ce3e8c0c2c4a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.11566837691197226, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015353140127384068}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.11410379757796434, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0015344257802797116}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.10159577693048065, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0011952248894988619}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.005802720511302557, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00036268341953713836}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.005939805394665047, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004255365897056713}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.005177050428898058, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003248271761339092}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09274207592681537, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011966610399347596}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.09159973215721444, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0012088819142730058}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.08095570189299443, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008902535417571856}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.11168791780519562, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014653968202923253}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.1104796475888384, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0014798128235735315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.09821383093532979, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011443600495618714}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.43632620722433213, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03652700409738095}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4aae56ae2ac54ffb564b5ee6d93c586da4a55a77 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.14601422897116828, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023528068041928623}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.17757421521329586, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025858960704315853}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.13853002039888562, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018231710614927264}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.02335655461073883, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001024065445599411}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.028259252819261033, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001110223234196}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.021415156800516547, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007744432940768205}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.11201526766555471, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018577920473740726}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.13622889891032916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019324235004749345}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.10498997106963696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012843653383884555}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.13843833207278688, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022359405028341195}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.1677188214159601, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024133512054743036}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.13100826999001244, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017055637389061106}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.390261470828549, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06598656718700266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c49bc810756f9507cf78648c580a738580a2a168 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.13285396611786718, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0030209658760757827}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.1422872983892613, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002878644422451723}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.11404753732090826, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021481783154813357}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.02798134561881831, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001386829306669423}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.028032889930510667, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011691508536939195}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.02209148162686031, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008417227606383286}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.10397480499710599, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002432028827552923}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.11068982956126656, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022301779950906378}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.08774826412280444, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015918396074012017}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.1253632873485129, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002862010683878808}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.1339058808413107, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002703503542054378}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.10726187500665701, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002007303392628382}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.465485886708219, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09344145038532585}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a3e427bd5e8612632c714e211310658062169690 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.039486863977905025, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021598107276160857}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.03691619467473134, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001837241505842736}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.03077167379460974, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014760361121106862}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.010893541084177573, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010824871341449053}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.008219229554088426, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006725668226973433}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.007149055705615344, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005554564096594137}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.03222582701764484, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001833931334057099}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.029219757580315777, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014381541985788997}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.02433445892307545, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001153157619824516}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.03741092237605496, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020645150808717167}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.034636931128603624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017203814565781976}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.028933058276345378, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013845885404859834}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.022392125697331136, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0036463991883077577}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d1aafe2747381f45439a03b78f965f0fd2e0abe1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.005198934009239172, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008983856883541296}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.004343154687994872, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006911835975220482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.003726273969867652, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005744387153012822}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.001571113975338106, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005052873938474259}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.0009049370506867134, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00021201763153043962}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0008390948687705988, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00019938162808952095}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.004366896062808353, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008005747263531927}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.003439980473577673, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000538456144374977}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.0030007686487377285, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004713098658540184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.0049827592266209475, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008729306567735783}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.004153089344111274, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0006636613143429368}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.0035454836584688973, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005472430761164219}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.7915004290523795e-24, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 9.569603854109237e-21}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a195f4816fcf944c7dd78fd015ab1991430ea5b3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732954}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.369, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015266698139154615}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c57019009ec80cbc22f1fb7c2524c1ba7a4665c0 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.339, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014976758771620337}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.341, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402707}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4969c8990af6e4d092a80fca50b393426e6e779c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.356, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015149042659306623}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.361, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015195720118175111}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a0b50ebaee4cb9b6e1ec45213ee3f3a645db70b4 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.345, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015039986742055235}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.36, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015186527932040115}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ecc16b2b2553c27ce2d3eda1d5a9bac6bc2a4aa6 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.346, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01505026612756443}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.37, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015275252316519359}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3db4e86a557334ef9a4dec7f22e2b919d2e79919 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.35, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015090650341444236}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.366, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015240612726405747}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8bdd3e7fb8e3c79b30d3b0a4e8ebcf5bc47b2f74 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732954}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.309, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014619600977206494}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f5fc45b3fcbeea3e8be78166194726b04a378589 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..337898d90fd6251daeac775f460ebdae0612ed57 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.362, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0152048409129195}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.347, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01506047203170662}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1118da9dd3689f8140516e291bf88898a3152916 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.348, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015070604603768408}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.337, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014955087918653596}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a66ec87907281dfca368d980a51fc6ec7b45d64e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.341, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014998131348402706}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.339, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01497675877162034}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..66eba3f7a7f954328cd814dd0b4a40c2f166e016 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229868}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.336, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795028}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d7ae8e67cbc33ec4dccc7c023e96f5b9f1793c2e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.366, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01524061272640575}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811476}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..43b50d63b17fa9b72aef050f5156b7c081ca82e9 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2e39bb72da8a7917b03a0da13b9096719ee37697 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.341, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014998131348402704}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795023}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..44e5a4d480f5479d7e7dabe4df1a26840f533961 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.349, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015080663991563102}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.346, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015050266127564438}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bd7b59da722cdb21ae20cbc8cc1e343f4bf73b15 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.347, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01506047203170662}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2ae02d0a121734fe827405d9048c9a15e0099f44 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095522}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fc82e22d7b7773b74c23e6b698683f855a5e3430 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229859}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.323, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014794927843348635}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fb4a0e7b2a6e1d577b9f3f1c8b7ef3ad049a2c2b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811483}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811483}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..86cec473d008debd892eb77b66e53036d1dda118 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.346, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015050266127564446}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.338, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01496596071022448}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8d47c525ad5ccf591fb3895e18618b7f9685f621 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.327, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411237}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.337, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014955087918653596}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..05c0f27c9e2d77b2f23c157b2e9cfe6af69fbf79 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.334, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732963}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b5fc0fcdaf96d170522f000726875078f3fdf87e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.322, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014782913600996673}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.322, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996674}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9b576eb84d5611d16a4598b2f2bac874d85cdb78 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.357, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015158521721486769}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ab2b2d5aa87a0f15b6257742718e803e0295d59b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6cd39cf99a6498204a68da51fc4586cd653f9fe1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795021}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203931}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a74be4a8dd0c0f02e5de78ef620f0c43d5751e94 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.342, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015008706182121728}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.344, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015029633724408947}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..519785dc18fb6fdf3b502b10733b0e2952268a74 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811478}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014842213153411244}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..67dfd871d94abc353a7982add0e661437ee0041f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r1_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01480686473373886}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0149550879186536}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1235faa71c35a82e7cee55248a3784ee44178e33 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01494414023379502}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.354, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015129868238451772}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..79df4c847947d4d4a118111d56c26e46d982ee22 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01493311749093258}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.33, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01487687202745673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e800fab494d9da519438a9d9478b0ff6c8d84687 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.341, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01499813134840272}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.339, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014976758771620335}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..68f112469f7b4f7bd2f92b45e4401910567f0c61 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.329, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928376}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.324, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014806864733738863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e45291ee1e9ab6f9be782a58c2f3b6ff4ad7dfb4 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.344, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01502963372440895}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.353, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015120172605483706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ae53bc34e6b704245e2ea05c0325b18ca95f6783 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.339, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01497675877162034}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.346, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015050266127564427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b8061c93b07c0d96fd8aa1d652c01df53faaa56a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.334, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732958}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.337, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014955087918653598}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1e5bb03e14a4fb73c90306ab350843eaa795e7eb --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..17e8a853fb774e0c79330e6bccd03439ba892e40 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.318, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014734079309311901}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792508}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0a33c4fcc93537c1d59d7d8cd0d2e1d6dcdb9636 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.328, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014853842487270334}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.325, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c3c0419cef2a05e2847cf361e2a2ca4d26d8dcac --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.304, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014553205687950425}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.323, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014794927843348635}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..302eddd196a0cf414b5309d6f75662af9b134e60 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.308, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014606483127342763}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.316, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01470919305605712}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d5a47c5263a23b76b31f1e981fe6e482e4b24d96 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541045}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732965}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e1ef862fec1ff2e2138a2a82eeb4e377219e20bf --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..074a06c01f0221f09c013879b26f4b0fd036ef1a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01483050720454103}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.33, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01487687202745673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..93e83c5eaa84d277830bc501fe41ce7fb8b56bd1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.343, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015019206922356951}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.343, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015019206922356951}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ab0478efabdc1518a57f69813e976495ec9414ec --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.316, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014709193056057128}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.345, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015039986742055237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..690373902a2641c02a66ceb428e37b8191270648 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411245}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014955087918653602}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..81ef4520277b4815bc03b9603da1850c8a2d46d2 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.327, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411247}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.303, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014539683710535267}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d909baabe5cde5f16c85e25c14d5884a043f39cc --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.315, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.315, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..84ff9db9be7ebd2395aa92b7e3395f2e18544b75 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.323, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014794927843348633}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.322, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996674}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..038737b3a3b08cb69d742e2c58c79d253bd02062 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.321, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014770821817934645}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.322, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..993fe8ae0abb650227e139932d1075119cdd1c39 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.324, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014806864733738857}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.322, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996666}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c0ee31d90de4c4a221aed611cebae2a05f555519 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.326, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541042}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.321, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01477082181793464}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b78b38568ec593f49603515fc8ab39d8c897fd0a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01493311749093257}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3df9e64f593ec0b3e856eab172dc08c515b7443c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9758205cb484702fb936a2a17108aff358bf4637 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014806864733738863}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014830507204541028}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bfc2b02765e8e24a6b21117f8f2c2bce20278e38 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.343, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015019206922356953}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.339, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014976758771620342}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5a1ab71e64c73027c01be0f482805766f6936bf0 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811483}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.341, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402709}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..52c382ca031a290797da2061e9cc88699ffc1192 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r2_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.327, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014842213153411245}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.341, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402707}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f77e9847250336b64b0cb655cd5cb002ddc1b165 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3383333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013664144006618268}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.33166666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013596836729485166}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..230096503a2c78a71a3498e178f5e5bff1295814 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3475, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013751753243291852}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.345, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013728421539454885}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c2d19a0aaea305f5760495c0f7162bf809e7d767 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013605417345710526}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.33916666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013672343491681815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..19dfa50ef562785253f739d3a6bc0d4c660dbe1e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.35083333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013782212417178199}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3416666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013696658778002515}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a417bc7981208da8f0d5e3a52edd2058d3fdc27a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.34833333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013759437498874072}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406394}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..639fb76956a8810abb8769edd4eba95c5750f603 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013630871843821476}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.32916666666666666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013570806258433623}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..793281eb3dd3a8c9de53435a3f65f7ed38ae43f3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013630871843821476}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3175, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013443538681348052}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ae9758b5838e4fde9bc6288c98685fffcb47bd23 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c2d57594757c4deb289a1e44bff705e96d2f4080 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.325, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013526454480351021}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3275, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013553211167251951}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..328879398c82411fa744fc27e0b5c74929072469 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31916666666666665, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013462309712005134}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3225, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013499258621103247}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2474dff0275f78eb2b91a1a5e3ae9b5bf0f89266 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3275, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251946}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c6dcbfa2639e7c2ea54708860a79b2c0c61ec16d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31333333333333335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013395739415639082}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32166666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01349009528298952}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b34d09bf8a33c03fa38de2f1a1c87d74afd86043 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3425, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013704669762934732}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3408333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013688600793296934}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..246377163d56fba01fa2e58c928e053d06051b54 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b3beabbf0cee60edb9c0ee6a4cd00ce2a55341ff --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.30833333333333335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013336721143136469}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3075, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013326707242912048}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ada57ecb62403a977f7e079306a375b6d4b725db --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.31833333333333336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013452948996996289}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.31666666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013434078660827378}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a0371ecab38aeae0f15fc3f83755aed9b2a1694c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.32166666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013490095282989521}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01347162092976915}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..973868118333d8b43529131a275c203227d744bd --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.32666666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013544340907003665}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3275, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013553211167251951}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..309dc8ff82e8be41593f9c16b0d8d37e63e3c0c4 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.30833333333333335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013336721143136467}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.31916666666666665, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013462309712005127}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9f6779777e6cb851a981460a449ed44bf7f316c6 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3358333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013639261190932887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4feecb765a7e9cf4eaca605708095abf7621f47c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32416666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013517438120881636}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3283333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013562032919529019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8a8a532235461d289f0fab1dfba6591fa0b805 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3233333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013508372867300215}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01352645448035102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9e03424394e9fecafe284a90cfc924a744df8105 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013526454480351011}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3233333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013508372867300217}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a46a781fa55941f35e28d7a42890e257b2ca5624 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3358333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013639261190932886}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3358333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013639261190932886}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d9a7bc6dca36cbcb9a64b947655afeb45856a53d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3475, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013751753243291854}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33916666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013672343491681817}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d6eb8660851f6b4a8aaeabd3aba1b7bc56fec3f2 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3375, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463653}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d8f5c76ff89a57d9804ec50bfcdf0821982c5e23 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.30916666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013346684134591945}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.30833333333333335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01333672114313647}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..11181e33d50cf0ff2a942e0599f165950c4cebab --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3175, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013443538681348054}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3233333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013508372867300215}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3f790a2bc0b79f9c879642c20fb378056773a95c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.32083333333333336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013480882752851553}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881636}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..43b6c4717efed55b34ccfecc095dfdc608f77136 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_anli_r3_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3308333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013588208070708997}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003665}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..430d6b9f135c7a3a793bc19f5fe24293bb7dd03f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012288926760890793}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012288926760890793}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c405682535c4b3400669a7aa00378cfc02837197 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": 
"heres_a_problem", "acc": 0.25170648464163825, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012682496334042958}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25170648464163825, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012682496334042958}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..732c203e1b04003355dd194902f3b616ec03f082 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24829351535836178, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012624912868089755}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24829351535836178, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012624912868089755}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..78b171aebc3e8eb9b6ec482f1c4187b7cf15bad4 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23293515358361774, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012352507042617396}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23293515358361774, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012352507042617396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3600e5536462ad03e46c64cdc6aec5ffc87e7200 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24146757679180889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01250656483973943}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24146757679180889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", 
"prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01250656483973943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5723db2c9e045256e61fb4d0877115e58303f1f1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23976109215017063, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012476304127453961}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23976109215017063, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012476304127453961}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9f027b4045a9ad72e67df77a28d0d28453526b64 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2636518771331058, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", 
"acc_stderr": 0.012875929151297058}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.30716723549488056, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013481034054980945}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..194e37809e28cdbfa989c5f4ba6911efc280694b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26535836177474403, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012902554762313962}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.30631399317406144, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013470584417276511}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bed0f4113d09be5093d72b32557ff1bf9f842543 --- /dev/null +++ 
b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26621160409556316, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01291577478152322}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.302901023890785, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013428241573185349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..426c527e5022ec012a74a78f88de2e782ac8647d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26791808873720135, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012942030195136423}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2977815699658703, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013363080107244489}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f03a743ca9e378b642dcb326367a001dedf5f397 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2645051194539249, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012889272949313368}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2986348122866894, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013374078615068754}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7a3378b17c0d18d60d9a32cdc40fa40dcfd2ecd8 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2525597269624573, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012696728980207706}, {"task_name": "arc_easy", "prompt_name": 
"i_am_hesitating", "acc_norm": 0.2935153583617747, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013307250444941129}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5158d2e86035112f0f9d192562640ed133517bd7 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2380546075085324, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012445770028026206}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2764505119453925, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013069662474252427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..63028d2e3348e5e77b0a7733766090db42edc2b9 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_1.json @@ -0,0 +1 @@ 
+{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.23037542662116042, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01230492841874761}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.24914675767918087, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012639407111926439}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0e6199573c43cbced89f5cffe87bae1a78a7fbb7 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2440273037542662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012551447627856257}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.24744027303754265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012610352663292673}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1c5cfeb7dd66c36f45aa17b77d391f5f1add7c87 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012414960524301839}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2627986348122867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012862523175351335}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..09982e50ebb89285b1615b3ed05de578664accf5 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24744027303754265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 
0.012610352663292673}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2713310580204778, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012993807727545792}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ba78e3e64613fbf5486b5f77e55ec597fa11e810 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.23122866894197952, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01232085883477228}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2508532423208191, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01266819862131543}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e498c60699dc8bc746233209fd04527a0645bee3 --- 
/dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.22866894197952217, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012272853582540799}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.22866894197952217, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012272853582540799}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b75393d0644abc49ec6785c2d4217a36c83527c2 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012536554144587087}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012536554144587087}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e6ce13371ba04a22e6232cbb178d614f877695d0 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012536554144587089}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2431740614334471, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012536554144587089}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9fd2376c3abbae77d1e795eba6b339230846f042 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012414960524301842}, {"task_name": "arc_easy", "prompt_name": 
"pick_the_most_correct_option", "acc_norm": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012414960524301842}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..43960835cfc91d0a212dc25782a3e115c86f891b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012653835621466646}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012653835621466646}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f7ab5ca02a3145780093b7b739eb527b6b945423 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": 
"pick_the_most_correct_option", "acc": 0.24744027303754265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01261035266329267}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24744027303754265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01261035266329267}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..359df79f5fef1a9e679499cd96fe0826ad1912bd --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2551194539249147, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012739038695202105}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.31569965870307165, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013582571095815293}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_1.json 
b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..231ef9415251b1a3e8686039b08737f60e1a499b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2627986348122867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012862523175351333}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30119453924914674, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013406741767847624}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8ca2add47a9cd623ffb693216ded6c5ba72291a8 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2619453924914676, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012849054826858114}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2909556313993174, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013273077865907581}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end 
of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..32fa1c7156cc7aad778d3a627082d9d850be058d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.257679180887372, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01278077056276841}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2815699658703072, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013143376735009014}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a0c366c736201d73cb9bd4d2f84db59e2615a1f1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2636518771331058, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012875929151297058}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2883959044368601, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01323839442242816}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", 
"use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c288a2613f9f8b5d2db5c563f545a9d3c44d8a2d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_challenge_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.24914675767918087, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012639407111926433}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.28924914675767915, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013250012579393443}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3dd1cdb9ee0d2e5d94eb9dd9d4f7b2cc32fc8bb3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2537878787878788, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008929657065808292}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2537878787878788, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008929657065808292}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d5f8a65ae51f5535558112295b560cd6847e1658 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.234006734006734, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008687500578023184}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.234006734006734, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008687500578023184}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fda17d589a76e8e17989e370f1734a468b480f33 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25252525252525254, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008914948991495704}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25252525252525254, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": 
"ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008914948991495704}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e40ea285b393064614901243a0c950ba9ffb99ac --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.257996632996633, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008977970005203404}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.257996632996633, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008977970005203404}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..deae2c4ea36330970234a827bb7d36a1f77134b9 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25547138047138046, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to 
solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008949113551665567}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25547138047138046, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008949113551665567}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8125c8b47a918e821fe4d104347d7cbbfda5c1f4 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25673400673400676, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008963590834042409}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25673400673400676, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008963590834042409}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..f0b53ef6bc79b158ca5e59c4d407cbeecd942f95 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.36153198653198654, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00985850654316206}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.31986531986531985, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009570821820573587}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a742baadf1eb24e59bc0ca0748531143376f6173 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.32996632996632996, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00964831157424104}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3143939393939394, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009526702423162905}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eae3625626ab5f8edc33f032ab50d35938f5767d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.32954545454545453, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009645184190953856}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.30387205387205385, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009437524848293738}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..eb7c12b1e25741da1cf5937b23ec14d1530a14a2 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3228114478114478, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00959395022036674}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3055555555555556, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009452181213593461}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9fb7d1a929333fb8de7bd98b20feb9f0c08f5ae8 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3135521885521885, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009519779157242258}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2988215488215488, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009392656275408728}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b4d80eb7a4df417b5cceb07533b736049b83a13a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.31397306397306396, "fixed_answer_choice_list": null, 
"dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00952324533521551}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.30008417508417506, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00940400055851335}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b49da60fd7ef75a7d21190d518f830f5365fd17e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.28535353535353536, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009266280584997748}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26346801346801346, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00903915737449771}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, 
"limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..80b7c73f22b5da7ea48ca6f9da930c9058027d4c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.289983164983165, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009310840970769035}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.29713804713804715, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009377397867796849}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..85440d4a6d24552de75f6525be34388ad243b065 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.30513468013468015, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009448531094163912}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.30303030303030304, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- 
{{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009430140669278962}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a6f2fa3d38c43e7fc25fb221018817a587707e93 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.30008417508417506, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009404000558513356}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.29503367003367004, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009358110551087423}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2ab064ad765e94c37fcdec8c0c0029e5e6350568 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2904040404040404, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among 
these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009314833302936285}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.28703703703703703, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009282621598983073}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ef622f85b8c74d367eefb6d7b6164e9971507d05 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2840909090909091, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009253921261885768}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2887205387205387, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009298805565435513}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_0.json 
b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0a2904d98e7e20a6b6bb310b200c00f00542b7ce --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2521043771043771, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008910024163218191}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2521043771043771, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008910024163218191}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4ca9cc31a39aaa818d05ee1f53c943e22cc007d3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2361111111111111, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008714480491711288}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2361111111111111, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008714480491711288}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c4cac3128c85a9f7fb9f3c1eff1fbb0631b3be8b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25336700336700335, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00892476542452926}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25336700336700335, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00892476542452926}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..18ee7e8b1fcc9bd141c430788266e8fd62b7a27c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008885233166386385}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008885233166386385}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..232a99aa87ca6d83df68fee5ce70a491037b9d82 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25126262626262624, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008900141191221646}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25126262626262624, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008900141191221646}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b112fd7df56296f06a2a475c658406026f7a5bb6 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.257996632996633, "fixed_answer_choice_list": null, 
"dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008977970005203402}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.257996632996633, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008977970005203402}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ccaf869dcc9856e980093b901fd185a4fbe441c7 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3371212121212121, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009700146509130073}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.3181818181818182, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009557408782506372}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e6487b39c41e6018b372b0705fa636b06b7951ea --- /dev/null +++ 
b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.32323232323232326, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00959721864204534}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30976430976430974, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00948817285190372}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8a62c943364de516c91c8e4ad2393214015883 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.31313131313131315, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00951630387930953}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30303030303030304, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009430140669278953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..2326580223a9b87c89b760ce29c748e81135d60b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3194444444444444, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009567482017268083}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30345117845117847, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009433837434252272}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e4092572788b13ae083d20bbef030347b16f78c1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.31397306397306396, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00952324533521551}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2967171717171717, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009373559492986842}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_5.json 
b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6c1675a72f1cbd917353da5b35b017db6d769c49 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_arc_easy_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.30765993265993263, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009470292575831185}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29713804713804715, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009377397867796849}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1c15681268520d793cb2b3afe5b246f8cf46d8af --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.6006666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00894326942995515}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.627, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008830798057449147}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..946be1deac1660aaf819069400d6a30313808453 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5813333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009008626314760201}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6046666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008927944837940472}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c3a1762bffb380fdcb5a022a75d82ab4fb4de0e5 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.571, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009037711366393888}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6023333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00893695992571691}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end 
of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f76f93e01dfff9ae8917f316c83db25d0839743d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5713333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009036836097555085}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5993333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008948239303079452}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..22cf1af670a8ae3de2ee58524cbcceb5085f1c3b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5693333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00904202497793108}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5963333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008959169522662576}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, 
"seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8449f19de4c7729de552b040df92d5232eaa1999 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_GPT-3-Style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.553, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009078792586293545}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.59, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008981103499757514}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..58a0a8d4f867b57987b6faa784d5022aac3d1138 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6236666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008846558976258922}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.564, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009055127374988174}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9f194ee7f55dde36983afb2b88d5dd2b8a5e7b50 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.541, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099483512819305}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5406666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099982269204863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a6ee188877828973056ca23e38eca31d9eb0f1c5 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5513333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0090819853069321}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5443333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00909427038138736}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2ccac9aa6bb862b01d74862fe91aeb86614e1a6f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5733333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009031496556538196}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5623333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00905900327659221}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2715d93292a0725be9d7bfd067cffc0e5326bad7 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5783333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00901748678876912}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.573, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009032396953831096}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c5ad61eb1eb0ba6a05872f06c877718ab9b092c8 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_after_reading_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5756666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009025076316539064}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.565, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009052751926300881}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..07710d57d6eba25289ee6f30d319657723de99b1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6236666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008846558976258922}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.6203333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008861873799148995}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e9dad0e8d207e924cef5e7f168097bd0db158f47 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5433333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009095877403306732}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5406666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099982269204863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e6d01e49fa10717cb5d2894520ce7c17bd805a6b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5436666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009095345834327867}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5353333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009107405418833935}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f7c16dcfdd409fe31d9439982322186f14940a4e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5386666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009102888762598252}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5356666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009106972161130876}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..822b78b2f505818e24ad4d076229a36dc06b0e66 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5473333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009089227499483247}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5346666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009108259669413834}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bec0bfa6bbdd6517a6b4d6c7863d346050c2c48e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_exercise_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.55, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009084465266030923}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5503333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009083851457629933}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..12f33969712c417dd19d4771dfb32dc7005cb69e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.622, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00885427200344005}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.53, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009113781890088806}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..23deea49510e8cf2c1902fc344f377a2bd94acd6 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5413333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009098980657278164}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.54, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009100967487199725}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f353ad643b90b14b022d86d5c552144a8c722e10 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.538, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009103824830376474}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5343333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? 
True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009108680663441197}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d0e1c3f20e1bf4f57e675e859c6ff1c54e3c4c69 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.557, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009070708861664755}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.542, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009097962646004983}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..959bad5d643f7e11e2fe57bc8e93141eb6b15ad3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5636666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009055910870388479}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5516666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009081355012045532}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c4cb47b11cd3f11524931b9efee02ec941519fce --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_valid_binary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.561, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009062029213030572}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.538, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009103824830376474}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4fabcbda5515e5c04b5ce21b119975ac1e4a6ac4 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.6233333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00884811049411477}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6236666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008846558976258922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..edf8c6dd7bf3bde00e6f05f16a234af6b11112d1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5406666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099982269204863}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.541, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099483512819305}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c91ba13be732c1d9c9744e3620af40cacee3c130 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5673333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00904706345689798}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5836666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00900149831714761}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0ac70bfa0ecbacc1bd72b1fa33e098835ae78e26 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5686666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009043721169619542}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5846666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008998379972670814}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8259ec88fed48d25c8d9adf3133303a58618f840 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5523333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009080082050148014}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.578, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009018450207660424}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7095a4c4b2e0b3211f97403310cf6b5a834b2ac3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_boolq_yes_no_question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5456666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00909207019506541}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5643333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009054339719898379}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9dea9eac31aa201fb8c0af323a38d9ba176279b3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.1940928270042194, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fdc4831754ddf5a3f32059fc52aa3a5bc7ae8fc6 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.375, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.2653673163418291, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..05ca0e90c6d718eb3a283dd898a6d3b42cb3e66f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.29992630803242454, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..07ef29a2210f6c118645bdf3a704734e833e887c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.2861685214626391, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b8ce2eb137fab776058b6a12147b54f0d35f0fa1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.27619047619047615, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3787c52df209beb31e2466cff08ba76c655ada8f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.26430976430976433, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5937e65c619a603a2b1ddf6671f3e7ce5de180be --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.1940928270042194, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..808c83781d9a65ce3c641a93a431ef78af590be4 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b794218b12f8d11702f72aebc99b5acfe6ab4666 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0672477765493766}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.3349371825715049, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e31b3f7984e5060a62e2fa5f8f77c3da448f4569 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2690727699530517, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4cc5f923f15591bd44224ba9742486ffc214c2e9 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.5, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.29072681704260656, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..67d61b5011ca9817c8f7dc4a924c337321063baf --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.24773139745916514, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..052037d362702878d8fba138abb48bbf4012f232 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.2857142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06091449038731725}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.20334059549745823, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..89e6d332e582a5b35247e8edb9ebcf37e8ff1249 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a0d6d29e8e0d2607226ee214e6e237fb6bfbafdc --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.33948717948717944, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..87c93c15085ee7ac88cf6690e8dfffeb0511db9d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.5178571428571429, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06737697508644647}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.35185185185185186, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f6cbaff5f5f19eb6eeeab27d86532805c3eed993 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.5, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.31111111111111106, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8114beac072cd39782220903fe0af20fba63e829 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.48214285714285715, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0673769750864465}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.29060665362035226, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7be8cecd2fe1475c27485401a5c2e03a0fcf6883 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.48214285714285715, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0673769750864465}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.2195121951219512, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9636ece6bebccc82a994811423e3f515f8c5153f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.34556087187666135, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f82e5e6822af32911f7d2a4beb0d29d69a5705ca --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.29448329448329447, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6cb63e764d239476d16e7e4dfcef4758e967804a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0672477765493766}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.2330246913580247, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4a640aa69ecce6ae2a586c11c8c9f588b1165d21 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.5178571428571429, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06737697508644647}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.25267737617135205, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6cf591f25bd722770b9c8e3cfdb5c0c5e24ede71 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.5178571428571429, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06737697508644647}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.25267737617135205, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6c047e38bce272b314329dd8bb0ab8a02cd2f0eb --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.2857142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06091449038731725}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2064336372847011, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..574805902916d9043d0b5aeb65955246c960a5ac --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9462e3161b99c06996037384e84a77a466252e0d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3024109014675052, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e1c4428e101484c6296207fc696e1290e2136d6b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.5, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.34175084175084175, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..117d71255f1c60d9cb1af9b87e57cd31a23da03a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.5, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3263888888888889, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..52278fcd8015a623723545faaa9ae186ac3ee18f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_cb_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.48214285714285715, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0673769750864465}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2931392931392931, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8dbc06dccc4699783cd831329930f1abb0115a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.55, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dea68d0595ef6d10a8040f0a1502cc3b8404979f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.49, "fixed_answer_choice_list": null, 
"dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8da1e0eabfaac081fe2960291a6e1db18ca4935e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620333}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c444b301e66c8b78e9b92cb78e61f52f0f17c2fb --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- 
{{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8c5dc9315620ff2797e9c8e35706f0487b6c3c6a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9e8146bfde7cab2e0f61472ffa568d462647ea48 --- /dev/null +++ 
b/4b284b21boscar/eval/agg.4b284b21boscar_copa_best_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a78bccef69ffc60037d751a002c07cddcee3a86c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.57, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562428}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, 
"seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3da25987951be42d7105ac1cc049058ebf7e1c87 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.49, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956912}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dd1506f4af39031a0f397afe1a11f33d1fa0d60a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04943110704237102}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237102}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..651c3904043d54907919e2d7e281c84270396555 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f6184410da6a6622aa057f14d22d3d9c465d5ff5 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" 
%} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..53cd23d208b1d426ac571a9dcc7718134cc94e20 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_cause_effect_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049431107042371025}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e128b29f7dbaf4a094c4843dcdef6601eea4189c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f847b7137b1626c3d30512e158d36433ecd21d48 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.37, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.048523658709390974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f32984da787277b1f33939808f0e742ee6545814 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04902071300001974}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.38, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.048783173121456316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e792d335b0673f00b27434063e2420cec18bdbbb --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..470e205b99ca89320a97025fabcb0aadfef3ec2d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e4049202d731c3615e4c02ffeffbe3e8f57f43cc --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_choose_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04902071300001974}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6c1563f27ba8c5f99b633a31edc4e0d9829928dd --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.61, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04902071300001974}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.55, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049999999999999996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fa75e9a13935d0d1d18955f5307bb37ffc9d634d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fd9bec4a9b48021405381207d00697b36898e53f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04943110704237102}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a192dc63f22446c6ec5927d828a708fc0a8d3a01 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049431107042371025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ad8d3b6bcaa6197080ecdc094f6e1a4e70a01a13 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..73d118f7dcddef4bac74f7eaf794dc9e3fdc938c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049431107042371025}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..670915b0cedce3488475e77675b9c6809c468556 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050211673156867795}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3cefb455d457963b0d5cb36517ac6f39d71fc1e6 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050251890762960605}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.37, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04852365870939099}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5f685391c0a1db3f58a68e03c67efe6f76b4b372 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a63a1eaaf610553ecaac0d7bd23b14ee4ab0b343 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04943110704237102}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049236596391733084}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9521faea8a6715936c01661ac4768917cd8766ec --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5aaf0a793e9bf2f5a884fa0eca2e7946f2d5f1c5 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_copa_plausible_alternatives_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049431107042371025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e12ba8099e5747a19b1924a7ba77335c09521477 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 3.0195237412009357, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06083199996700688}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.1801203066264519, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002720982526713175}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.2755817289379272, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003631624010853788}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.20972323261781503, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028606817573562354}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.0667960442666357, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014395935463214797}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.10315461019309466, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002160644851825999}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.07799688762882317, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016150990465315606}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.1408194558913243, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019537355788883993}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.22086380856877, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002913704901414896}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.16577291549436218, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021511059705423764}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.15471647764935745, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023187171223427163}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.23726074831351426, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003123412402969155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.18010845014611573, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002424941965882981}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6cdd2a207c6cb9dd681ba3d4f46198d2868dee21 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 11.453380248459945, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18516188812979686}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5373318783599824, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003917154984030972}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.42172162556828374, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003363645302615997}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4476728484979188, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029280576089271435}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"coherent_text", "rouge2_precision": 0.25820630689480006, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002905622919692422}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.19826349640259594, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00228926623954119}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.2113574436167878, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002207577446796678}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.39371548089187824, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034038731394905224}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.30546374911268226, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027166850788240935}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.32549231149234764, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002487047300178349}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.4422536962941675, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036803390199736026}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"coherent_text", "rougeLsum_recall": 0.3448887088986636, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030186579678220505}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3670610104063968, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002755900373058885}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2984f0cd05b15ef89b65cd4005c71dc8b6a7a607 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 14.50498117442687, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.199944172481821}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5887790541929973, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003262651099040029}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4787709415006632, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028961391036440892}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.5033284407510777, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": 
null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022482442873726256}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.29848800947384685, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028146889310476536}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2386615975453258, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00227464540650362}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.25126695886715306, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021103093049905787}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4335431846872657, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003041932502672048}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.3504156877121987, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025130404124840485}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3688360845444858, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002144196545004686}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.4889683087018754, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003238205206471699}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.396234345126879, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002752773124023653}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.41705421018227823, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023359077868479123}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..db87797a88dadbfabcdd1f85203f1c9cd1954a7e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 15.52122027752411, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12178399428274998}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5943056234570641, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031613114217466545}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.49080579235224514, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002887432702643965}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.5138185171536801, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022075774721638454}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.3035973599869196, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00270545738266106}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.24895866810157455, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002353494705639878}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.2600892788255361, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021258730379320663}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.43863053675479546, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029800420206917526}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.36085145822203146, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025572884098848765}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3778880532609749, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021589962574373165}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.4956533881700253, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031521759054215863}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.40860648474620076, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027644229003361666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.42804012925193735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002324183115712361}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4639666231ab854ec3cffa9feae233a1c4cba2c9 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 15.80040151989149, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16820667809122108}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5960169570684771, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 
0.003153282877006394}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4956677150438472, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028826068990134264}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.5188871554976948, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022652768418355116}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.30558156632064254, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002693196777521595}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2524058554563755, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002376081098564734}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.26380022955746174, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021699688555991024}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4397322331548957, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002948840127314331}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.3648882670279415, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 
0.0025648519457164916}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3819640735718326, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022089567195582877}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.4985280260096084, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031611778556326644}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.41435411543293943, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002812123109588658}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.4337762791584858, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002402885949071798}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d2f53a608f643f2ddc3a83a8e44575ec9a0c5c78 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 15.975544621705051, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1579004448764518}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5947966548185337, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032050395613254807}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.49593447472794544, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002883113398307053}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.5188499671667609, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002275638577386502}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.3062471081766795, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027809428178784284}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.25287403669722214, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023773224583267306}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.26453287785708496, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002209135453833924}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4404441060880683, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00301839351177247}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.3662472446966816, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002583976795011767}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3833627449905156, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022528184957952757}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.4994143702924253, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003219053679193333}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.41531212344400226, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027683691583775144}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.4350608743780979, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00240185780147281}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4c80ec4a670b2300ee028cf2df39e2bc4317a3e6 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 2.1585732992390034, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.057283755545401806}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.2545704766225674, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002778559965844538}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.40955804247489414, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034632388293561835}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.30605314016841034, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00295652774844176}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.07997732391252783, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013066533136759076}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.1258719249085403, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019245060999262846}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.0953144635435075, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014784395007114161}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.16837333279413402, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015437698900107716}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.2821392548746015, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021460486676796752}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.20529901859047706, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016623332234901712}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.199053080158595, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021506096653628216}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.32312955789441217, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027345035801367985}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.2399648342595197, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022887681185580016}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..21312fd00e2e479da80f748ee20b9b71d31796fe --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 11.125306912232396, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17790295386633892}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.5569688477314088, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0037048052877115076}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.42404460903081953, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003244565701809299}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.4560439580303458, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027412591881031878}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.2613191609571208, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002833212806742642}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.19573012562121292, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022433262821983796}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.21079598546546485, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the 
following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021486250857995085}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.405669026220581, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003271384812666787}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3052491335790724, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026212753055562302}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.32935726670214416, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002340422818555527}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.4548387827219991, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003512840404754266}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.34412912946816115, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029023101925979514}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3708772964846722, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0025874295501989186}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 
1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7f8642a919676105a057b91642c0e7925a1a47d3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 13.813032450428782, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09230276426798453}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.6032650507763088, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003318432943559324}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.4672304478737013, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00294371456779212}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.5006076709725926, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022915149769133685}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.30416572111597057, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002857287549126892}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.23167563766203414, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 
0.0022815742048801517}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.24840794312750028, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002115487840852602}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.4456942993345726, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003132022239080753}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.34237578293418114, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025028374419928507}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.36762233413078943, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021522899608404623}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.4986399586926029, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032916563071556525}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.38468678128109907, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027331843934643553}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.4126125012797959, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 
0.002314858859168761}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..22686de97f28216ad22c24f1d701a7b9cbc16ae7 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 14.483672743525295, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11929516126568702}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.6038758728327503, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032028637360418007}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.4744733062907945, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002876028623643228}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.5075591407090586, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022251762998982765}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.3055894171560721, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027886849751003356}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.23717581869311993, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023052901683455382}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.25378239680253595, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021425356705422195}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.4460832287476524, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030259872274363115}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.34878741185181833, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002498911308943595}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.37357349751082325, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021616271893979125}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.5004728706993896, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031965878124048455}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.392287327193044, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002719219158202565}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.42012395813675424, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023314918549654064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..27c24f82d9ac76919c0fd0b93ff85eabaf03ff95 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 14.929762015804371, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12898047247308805}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.6022142558158895, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031609014593905324}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.47984349113713315, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028797812370297167}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.5110531953195654, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022411307696570434}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.3076095968625754, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002798498179123258}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.24178364515600512, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002329282560521692}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.25768372768308784, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021668151991038258}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.445074501071273, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003000311479460939}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.35356822619623607, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025548952867894027}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.3767138926335197, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002205434337178266}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.5011709897391451, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031888091946731143}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.39903846221733896, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002793611393672377}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.42503387255872077, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023845126436349852}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3d4b303bd039fccc3aac4f4382d1030828922a3c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 15.223644297182389, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1543953907459507}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.6015027603881121, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031982719234700313}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.4820792569040302, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028725797677984985}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.5126128110763807, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following 
data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002267700753729786}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.3075111272260856, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002775456152522445}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.24398746828269524, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023580828733568196}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.2593212430045562, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021871613636693333}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.4448480337595403, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030234611940094993}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3549826197897851, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002543994777103483}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.37783699954395017, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022179761130291394}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.5017155875511411, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following 
data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003220280562846132}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.4009107110812699, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002754771618482513}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.4268497329572666, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002388843734324405}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f435b8008be67421d6bb1bcc152ec68d97cd0c8e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 1.954444748954989, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.051868039637637996}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.13769122808003648, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017596998810661165}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.10639231990905271, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0013988711766301635}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.11526409537525575, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0013631415982193285}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.07169075124195784, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011707222728807847}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.058024124805966824, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001013674848977092}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.061308190194015844, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature 
in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009528331425054534}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.13598668737421912, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017358845797809722}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.10496985768149185, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0013804037318383363}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.11375684850778682, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013428955348118651}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.12448022797409529, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016204095898696496}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 
0.09733342412728097, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0013323499351542672}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.10487679476333724, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001278747634008628}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5ed6fc17c4ff17c0d5a1c8849beaec4e373156a6 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 5.689076651457708, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10955679808497602}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.30205344791644223, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in 
meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0038011993380253984}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.2591489728142023, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003578393084400893}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.264668584146837, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032621365523836025}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.12740361481749896, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024147537412165677}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.10957433057544176, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0020988896354448804}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.11095311869025824, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001983294663988576}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.24430027357651582, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027435596209212753}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.20571374170532966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025447737588058707}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.21195460544874364, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002274999033300145}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.24692727270265108, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} 
{% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00332540047296337}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.21143151597592733, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030797887136175474}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.215850348956597, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00282606722123015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..076e487cdf9b9b8b5dc3dfe9022f51caa2eee008 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 9.361606353028591, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1451212245342907}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.40668806398214247, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004503127021297256}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.3430030141127422, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003924569338343976}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.35373295631398277, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036780575347303305}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.18924602497072013, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002957950517929727}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.15855760926664064, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002460433442821022}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.16320225416214543, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0024001272796585786}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.30907754734477744, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033296328679694575}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.25878608350993737, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002856832446420683}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.2675200681652855, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0026402526220161704}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.3342219925602536, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature 
in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003958418609375523}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.2812305254099742, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034020232164490628}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.29005865628135225, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0032174705384268136}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..eb9605bdcc118582fff9d6d102b32e803276f773 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 11.414656994987986, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, 
"comment": "", "bleu_stderr": 0.11270862432238564}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.46449466056623623, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004562321628279579}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.3842244124504893, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0038568970386503925}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.40094473628048855, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036476150229787043}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.22687900450207923, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0030906636363050716}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.1855486684910055, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} 
{% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025302721311659462}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.19370105271832214, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002470342190509859}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.34573316001805976, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034748050946930213}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.28542502655034385, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0029279054521511878}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.2976665060188424, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002731769606142919}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.38244492367758776, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004035511089277577}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.31615336301566255, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0033904198313919145}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.32978958530156677, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0032355902278707826}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7b055c0f9115fcd33daf4a7c0af25358a0890cba --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 12.732397361679025, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = 
feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18621333707534024}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.499291122039099, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004476245438047599}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.40989488096064325, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00370623790346038}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.42999610320261095, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00351827199150819}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.24841091607353702, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003130775311395358}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.201675737588955, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025289410126734155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.2117257310114321, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0024896096126455045}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.3689143198658064, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034839108392620893}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.3029948266133246, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002899192792529678}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.31737123063793404, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027279652357510213}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.4138527816259857, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004010246149220487}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.3399362615672215, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003336886045704235}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.35637466690999814, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003195201907601668}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a92d91030be9c7e8db88551ab242f63e35afe356 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 13.577169092929678, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": 
null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2062668532660596}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.528290529517535, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004318140358067149}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.4316094452050039, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0035832199286407527}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.4548582412316575, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003357067322622565}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.2671985172121451, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rouge2_precision_stderr": 0.003111440606188032}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.2158163830401268, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002528320168515306}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.22760620908222556, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0024740789360211783}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.38911274094824855, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034729933166221696}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.31774835943631824, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028682350509974387}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.33465045341993305, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set 
key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002709680333869425}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.44003076242549466, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003953562938723459}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.3587907276906629, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003242670548648085}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.37843382736640735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003108090184096992}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..115ed3dffcbaf16ddd53fa9cdc5bd544e7020e2b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": 
[{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 8.153800130249518, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19718858686762447}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.14254055468502322, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032976710551722}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.21985285950008995, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004764464623700324}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.16625763147318187, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036077916351727156}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.07117779808817919, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018071269269227326}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.11165748130728335, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0026568526220331497}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.0834337726594754, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019832478132023163}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.11919497082898337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027419572678360447}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.18525665107433087, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0040358655783698395}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.13948285594182427, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003021076305392557}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.12480961931955924, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002918263840706436}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.19394917883355808, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004289318267212664}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.1461396427981472, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003226960516681579}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..799b27f0f75b9e5abca344c46c4fdafcdc07ad63 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 11.82816578683988, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10613794908694005}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5741208333690715, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032791825020937275}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.43616299285547877, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029918336223318646}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4695495704689584, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002364549723893871}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.27417940276408975, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002745882165587474}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2046937372687625, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00215463330838664}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.22091389030064756, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020601818538796947}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4168038088426164, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029886435691974275}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3132001002805105, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024293669614656285}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3383298269843499, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002100531054867843}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4669926334976919, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032045235446491534}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3531706097054418, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027268177176508718}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.38079591964268383, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00234065536986238}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d57d42470fce63e5cc5f51d2c5c974cd1db64050 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.210994141328136, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14395128449450548}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5957938406647506, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032189851910091535}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4666858600777237, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029439118893739937}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4992077922612147, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023268998699713796}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.29925246097428815, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028135040853843815}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2311260083068878, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022842743345053117}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.24745510003725912, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021577622824456}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.43644170204107635, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030000260097151686}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.339812800549829, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024913510362416656}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.36403535470109827, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002158659011059573}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4937561615312527, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031994326527389665}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3863879597392378, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002778922240841303}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4134385994397868, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002387535627435029}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e26293f0242115b8a84f8903357573055d222223 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.758419275441623, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14846553640981838}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5969365877168988, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003176934049191526}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.47321803649209376, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028949729129975907}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.5048691947729075, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022904626291377636}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.3021673329464843, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002773447672105312}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.23749889827974513, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023473465141403524}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2531409526666904, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021887540705134477}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.43729156400302177, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003007804981399626}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.34523784521876366, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025266731757362768}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.36861382037574736, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002208485032472364}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4949310693592707, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031940660026111094}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.39247584122491685, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027974990919393808}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4186873543720982, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024264140364972063}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..143b5498cde37f049f7070520c0499bb9749eb73 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.9471441955308, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.20049087654668654}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5964162408645288, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031670174643345854}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.476041597124549, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002870553977627495}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.5068439147529751, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022702971756168133}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.30318643893717323, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027688869090984037}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2391458905396845, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00231587546078518}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2547260269260162, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021693766012080536}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.43667748453350236, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030068254256241543}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3473159987716765, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025367602883456315}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3700166988328032, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022206494000581454}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.495394213066299, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032160309119094407}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3958929166908767, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002827379825231143}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4212470848260628, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024487659358586895}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bf97d3c98a93f72f084e1608f9db7bb51ab8d53c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 15.12300988029553, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.27781770916239995}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5954584960080906, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00325049366305589}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.47888879196863693, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028429629695053142}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.5094338990091782, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023180695212366305}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.30380630927932106, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002835445529306852}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2402751342075056, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00226887033631254}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.25642542561601567, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021909983236262304}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4360299975558518, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029988845638560017}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.34982928906069083, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002474832286550078}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.37228821193907236, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002208419488038826}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4966033704013741, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032394543771446537}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.39901987621454466, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027430779669785923}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4247238232753859, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024321405233947197}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b030f99ce62e0e828f7b9058bab59774451a74a3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 2.4148460846552413, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05036198012608734}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.10560357672768791, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019908255113940676}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.2194638304445651, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003995820797680711}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.1391414255890394, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025400979508730013}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.04417622381086348, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009448315430017289}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.09527403810619732, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00205880226103097}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.05876533984038407, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012344870500762685}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.08914439768173961, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016567906456468919}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.1883312467548728, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003488302660435668}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.11809718257668743, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002146842991910023}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.09300944711571149, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017956015291556789}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.19303766058306765, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00359467107273429}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.12249168865764576, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002289868109882485}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
"task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..113d2b1af8a25259d6f28ec04d3a3c7a63a845dc --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 12.297262313527675, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16385815790982017}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.5839558387860917, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033331147772366473}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.44014655260365004, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030139756921077616}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.47525841358381093, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023583251642963136}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.28661390078191085, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002901449616488372}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.21116517306370375, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002214474291192635}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.22890209457362648, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| 
{{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021147133262873295}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.4309350512738497, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031735439185340223}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.32067098855093723, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00249989474809957}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.34766320387998123, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021748450214048288}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.48023193119114316, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003335283437779837}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.3595547681362966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002750841601478431}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.38918206086481305, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023621524251225866}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..473b0b97913c40f3b1ffb99ee2fe077e644b46a0 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 14.483863606580098, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12131630196525124}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.6058141210991159, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033088115891164284}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.4741213639409289, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029151815956458198}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.5066707261948298, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002262998972148006}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.31078761548465383, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002885785543521991}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2393070319511852, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002287716976357731}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.25604760044687425, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002140031183397711}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.4468805003217975, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031330138309882435}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3474439134529135, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025028002612465204}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.37185442092317, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021578801071099915}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.5014613640312711, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003302363397632726}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.3912761034599153, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002741039283689631}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.4185811499353104, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023421461714562}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0cf23d4e495d34e254a459079ee5bb677a2b7b58 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 15.191517295510629, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11974969580866728}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.6075957192330093, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032637968261048796}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.4812539130894964, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029149831551134784}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.5130996047904425, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002285615607319131}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.31513884201161574, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028405657336013543}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.24627662242850876, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023207152571040795}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.26271197369374893, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021634028658777662}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.45074514333996246, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030740552400188395}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3560656787748778, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": 
null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025774337186458807}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.37969768710882007, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022273418438245704}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.5067196286957013, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00327275943788992}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4005630163928573, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002774183370900057}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.4274159914050653, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023927259878927885}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d27123493087503e6b801ef5fdaa6bae63e3ecf2 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 15.66121355308185, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16746194817955856}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.6047624718754928, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032141654509862407}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.4850925511554926, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002903801146577782}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.5159411031234575, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023173099434433014}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.3140652011020034, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028454442972433254}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.24919016906797625, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002393680033053097}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.26501632949202497, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022417359787710793}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.4482536974201123, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003084926330807341}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.35780889315761033, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00257596254524435}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.381084835759455, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": 
null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022819361864961927}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.5060122437920864, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032490282463882147}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.40566476036911275, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002823543129525192}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.4315449793197658, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024568419006428172}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..74dae033c2577e188e114e82e6027cd81221830c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_e2e_nlg_cleaned_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 15.723080080667856, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16941632514539806}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.603309962822727, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032242188989074585}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.48695983010677835, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028628867830855025}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.5168974474939331, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022721227951672234}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.31312278266787735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028270590416650445}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.25037720266691516, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002392884966417364}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.2656196908399968, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002228936591291352}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.44842365936776796, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030644240344688198}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3612862341962741, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025851127132254853}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3836187007598527, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022832459662179565}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.5050355873000846, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032681356379517545}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4072970583395843, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028059842659635707}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.43255318470589826, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024535753742802357}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..64d893a5875313a876c952412ae5367dcc7a6909 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.10344716943873676, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016259799248270446}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.26090880633509367, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0038477185681414384}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.14637569517250587, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022073444081297322}, {"task_name": "gem_xsum", "prompt_name": 
"DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.01490073234297703, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006589770129935276}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.03873031105763643, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017582245582100646}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.021257086751659823, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009366192439889625}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.08104406326081663, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011547653388531021}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.20585546458892723, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028600607223324866}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.11486954284177152, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015736623341304518}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.0831734971325524, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012946845061314219}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.21156835133750618, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_recall_stderr": 0.0032138252754335896}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.11793220629910407, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017742377889203762}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.7483386307387867, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05365818544561724}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b4b488b7c9d1eeea8719ef30bdc668d4f16a1d85 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.10675646531069118, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017079434120986104}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.21254174972245313, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0035442914228111582}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.13489677526242255, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020188404213789704}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.008741098009427668, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple 
idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005494909789404459}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.019852346836983944, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012824372215275344}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.011736790474052875, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007323025837186397}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.0786172237029301, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012104869162365519}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.15650442119288974, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002520544874358908}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.09907990741492997, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001390441723617538}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.08565732122109287, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013227204476746756}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.17284931760421135, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029211829599186004}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.1087833032639467, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001606809153789491}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.5016993367817201, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04376564683106677}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0e4fd081b9cb88c03128f851e35af66bcf223f05 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.16549164819743437, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034526382178408383}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.18742513953883105, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003806437750265605}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.16241776794419802, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028848994027258442}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.024661798518168125, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016895454273242517}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.02739768677136293, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016421467568835}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.02339429020564221, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014396242595328612}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.12663862154443478, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0026682593533234114}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.14121993779286504, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027313372184720654}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.1232803959263642, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021296445004168572}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.12895564267261075, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026773474157606443}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.14650288142341764, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003019787783429863}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.12639544876411474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002202317812711371}, {"task_name": "gem_xsum", "prompt_name": 
"DOC_boils_down_to_simple_idea_that", "bleu": 1.1863991054607887, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13126259759197448}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bcf7a91a60cc47fa3bb8caf2899884b19ad7f88f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.199491926427676, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004346987952839152}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.19499186373063174, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0040083275173924}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.1838191377493394, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003540784118642653}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.039281858500698104, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002301025224624484}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.03776716790343215, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 
0.001979743786030461}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.035667825755264795, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001908336809093584}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.15200765203777006, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003504702651308}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.14674619587776616, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030683749877275252}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.13905902806555284, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027923693461365568}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.15365853324218348, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0035033293697679137}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.1504264543019513, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032229831137894752}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.1412465961407836, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028176425939316443}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.927416551659746, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", 
"prompt_original_task": true, "comment": "", "bleu_stderr": 0.13768130191760655}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..94f520ad2a14c52f1e9038c4ac4cd446e9e9a7be --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.05932877286153886, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0038035599974941344}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.04877768076364228, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0031918925275879794}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.04942769980269383, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003100893627673165}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.013087116001066186, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015109923619292477}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.010914081354624636, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001152072609333735}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.011000794695724792, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", 
"prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011742366069366823}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.04606934608031163, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00304472521629779}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.03712032219694214, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024797647071667997}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.0378284212328296, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002425844759650908}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.04670525345751398, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003059955056808382}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.03806261734141931, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002530887858069215}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.03853556917570637, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024502420061577436}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.06705573747521477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02166430236564802}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2b86801733a4e29de67b107b885d23e8cbfee788 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..776e23c59fdb15175bc5e1d0a723f802006bc239 --- /dev/null +++ 
b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.14574516028104512, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018712457151061363}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.3449236724935976, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004320383977323064}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.20225260124930794, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025001046641434125}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.033711341577120454, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010598029070401104}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.08308586968025274, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0026382706337572917}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.04728640552761858, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014730864192413137}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.10980468800637298, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013446137122648245}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.2608506185694507, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031928603764024353}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.15245422876205902, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017970754977725638}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11677865105898154, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015785067645458074}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.27805190370113925, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038070316552884773}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.16232577634393053, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002139935162676087}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.7621407465701262, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06113609142697404}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d37cf482e59d90b4ff119ecd559ed23d7df2c46c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.163542980651184, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003081723626596105}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.2810483758189185, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004490053395879538}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.19140416518663397, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", 
"prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002823344391795949}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.031135203534715963, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015263295233267145}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.055979904675050554, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023805460524915123}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.03666889011459831, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015411558646631956}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.12253980798125551, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0024936905994992586}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.20840253955539761, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003332368535118191}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.14222920999260133, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021284776195994895}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.1285570844683415, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002531324924973325}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.22276474556133527, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037612883055859847}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.1506050075529136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002294513585647674}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.5243771210089576, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0660301007465096}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4eecc099716f8c93e9ee2591f733de6219fc9865 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.2525132293787072, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004048687072651401}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.2514932935540875, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0038107088016241307}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.2367541422956314, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032909140337269572}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0575454237989695, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0023125091362061257}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.05665840523908167, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021563440811016026}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.05361519189651879, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002025045267401813}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.18999003721752228, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003327472779717041}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.1883122635578014, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002983932837681397}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.17748346894897998, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0026653380372031374}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.19185044567781862, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003315470542686825}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.19181396779702334, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031212995449732017}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.17976252206146795, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026772946258546014}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.663637021745117, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1489764859030127}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_3.json 
b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..30efeba742f236e7eb58566e2fdae5bacce10347 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.25963319479441493, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004586241879090658}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.23447118873035172, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004097558867330643}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.23342689177585182, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037455640107493257}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.06148774851494584, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025664242616005484}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.0555392665292451, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023190320837292366}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.055403348824886504, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002239667742211083}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.1935508847274175, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0037493722409142932}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.1734702299724183, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003172612394305438}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.17288647337982307, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002961595593698407}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.19489189124727585, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003750963505043716}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.17573627157693272, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003277150547151035}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.1744328426789123, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002981641820828236}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 3.0167779291606345, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18715886411594176}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..296c00ae1fbb4bb1030a41a56e7fb4ff67517b7c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.07096229489647175, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004454103631846769}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.0537747056537198, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034117193891468585}, {"task_name": "gem_xsum", "prompt_name": 
"DOC_tldr", "rouge1_fmeasure": 0.05658003625774032, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034580616635425912}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.01908631835669933, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002058223769449137}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.014089456125343345, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013631204130799945}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.014915911149834407, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014210306358262497}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.05488232507714324, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003638011853955081}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.04007173199742235, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025968781288273827}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.04254235209217209, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0026641802157739655}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.05528538478732102, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003650048657550538}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.04060499621647696, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026328638098159266}, {"task_name": "gem_xsum", 
"prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.042987990330416365, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026862384531954266}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 0.06857179391271788, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.018883898270683333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5705168bd872eae83c84ad2331f3e28486f034e5 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_DOC_tldr_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.0025216182263907015, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007538054243947512}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.0017753801346017414, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005160214923546179}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.0020234066276709393, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005877963929551324}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.00031905987094666343, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0001753545101664957}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.00020940537449971414, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge2_recall_stderr": 0.00011249708102079408}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.0002512424682236003, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0001360024645875312}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.002044972230876781, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006006047303809878}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.0014822793037785856, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004337171048834188}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.0016644800581129724, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004779782544389454}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.001991370172557742, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005886709786944241}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.001437140728352026, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0004219329829170707}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.0016154724619355648, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00046538388224745525}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 5.985570341439483e-44, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.0145592778949524e-37}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9f0287e461257599a4b1ee8d6a5b9866cf6baeab --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.14108556952980922, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020956329882564906}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.32629804825161535, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004756174570847187}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.1938850065877847, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00274725146396725}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03392541732989178, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011674749125079006}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08137541100012088, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00280912091771862}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04697902966116714, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001582424189239984}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10765433273641108, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015745202745112603}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 
0.25042455718394274, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036806317265347037}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.14812739050025472, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002066618016210627}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11258249311328207, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017981489130128542}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.26182933742867265, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004140212650666321}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.15484724382764298, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023502913687865295}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.8671229080125915, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08065653508825953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d1c8cc2e825a2299c871a8cb96e97d9fe527b1bd --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1818208364925118, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", 
"prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033811439409721453}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.2952342650035309, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004323771519378331}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.2072547434350325, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029185590817686906}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.039255544280436615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017970345435637744}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06401578769657801, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024164697736714135}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04454429056503735, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017367902881785364}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.13745172437510383, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0026008926362907463}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.22412058258654055, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033458865795404046}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15664138971417202, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002216089409249802}, {"task_name": "gem_xsum", "prompt_name": 
"article_DOC_summary", "rougeLsum_precision": 0.1399352840204044, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026319234043525545}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.23050320162406268, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036405761255609054}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1602204327991172, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023319587866224892}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.7643751753804513, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0800024640836125}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b1be2731a787324a95d4d939a0a56d86d952e341 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.2472505852052675, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004009850648659755}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.2567441304949634, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0038189444756239026}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.23644320153754822, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003260843172949545}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.056866160489061936, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002420240118413503}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.05783181540065835, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022927551175836464}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05354419366516433, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002106078230291637}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.18570698243499414, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003303016306234649}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.19264012368473676, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030469236090590076}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1770324612357477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002634383182795547}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.18746784009032694, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003301909604388057}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.196059514574862, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003205680714038162}, 
{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.17924796506451007, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026665867584947088}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.705819603984223, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14137667865787865}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fde7a60ad9f72cbbdbc5e44dab5e24edd224be1e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.25083050440215976, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004340710144695701}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.24313830962431818, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003987068682586417}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.23375535546317172, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036448754340216603}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.058302376165758424, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002462019491771468}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.056822850129455496, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002365353936727513}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05453694888387553, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022209047576266894}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1865779794802579, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003475251260232787}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.1815458312356556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031953873575430005}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.17375448749498826, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002893029439925038}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1872796185903397, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034817440259912887}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.18253965500420624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003256036671696088}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.17450339640702453, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002909832621947404}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 3.1173819151981363, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.23709102615992358}], 
"config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f95711959caefc9e86cc1797dd0cb8dd39605b72 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06668389338088138, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004287951717923472}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.05770356648149944, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0035956507787910136}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05594753186729229, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0033676619143302084}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.014963307619343467, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016114455998580692}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.013741449593915791, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014260391208111062}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.013296321991506897, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013642061125793434}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.052652024080707335, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003605493741125939}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.044415851405158295, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028463083421698503}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04310400237676907, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0026639130697933863}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.053307788896004335, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003626638986661626}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.04533527259366541, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029075081420929395}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04383519297886595, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002700540506868496}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.15136892255345546, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.037709514860432364}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..49528b6392b7feea2a47730d8687c6bc0600434f --- /dev/null +++ 
b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.00273620898511371, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007798516293741871}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.002264558087005939, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006160940919096034}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0024320284759234414, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006736354616386897}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0002196833564758093, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00011015503412814092}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00016199733180865256, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 8.184314454744501e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.00018583265458717857, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 9.342731096527664e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.002119565890814211, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005829990564075634}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.001776402682704595, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00046929937422078794}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 
0.0018932762031196894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005068809117616748}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.00220984304166733, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006273058335995559}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0018523030709016646, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005016261609450595}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0019743733562370912, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000543487074589739}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 4.919560458856041e-39, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.5763240226586303e-34}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3191c1c9cde4ee52539f835baccd97ec47b3d3a9 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.13877902954547866, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018329219241202802}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.33229446957580333, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": 
"Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004296754137342145}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.19326845352360203, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002465944989792081}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.02676204259801757, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009319137963402368}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06705747449853536, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023932732420159494}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.037715758291965086, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013073049053882384}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.10017275416736435, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012836317273353171}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2410010033017358, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031284342960650143}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.1395940328246557, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017302666137824698}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.1111454306741116, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015327186901242722}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.26752711604279944, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003721123931794142}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.15498057414846977, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020821371700032684}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.3922923422113362, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1085694848628761}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..471efd9cd3da410982265718ac43b979ab43fc43 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.15438000924889625, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024173163816097426}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.31099805974371736, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004369924984110172}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.1951883864682122, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002532666335194767}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.029200709009449446, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rouge2_precision_stderr": 0.0011802714510965404}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06385551767647898, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024020535837821445}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.03803777610925088, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001401187389723631}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.11349074308725517, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018559801970629032}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2282612664387364, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033052644499383292}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.14290021004189724, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018761553752169517}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.12144057591718488, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019272376236010539}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2475391717657514, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037512061755519796}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.1540465769798701, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002078015927158577}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.5418959887061783, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11447597977337433}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2e2e6f513e426aa12613ad6c754570299672455c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.21235407890938018, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0038584821309733197}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.2922415035902346, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004426499849196656}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.22345527223273026, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003099961120505977}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.048280692831307814, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020950935786571852}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06626083711364002, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024510193668861795}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.05031306585865172, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019103069927276113}, {"task_name": "gem_xsum", 
"prompt_name": "summarize_DOC", "rougeL_precision": 0.15845617681932553, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031065349816888337}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.21629686584539104, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003363162795489695}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.165786008724537, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002468629343967069}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.16392354764710418, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00308613239016732}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.22961193568282157, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003797944820865862}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.17351507856807527, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00256723897382965}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 2.087587201632086, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18248784506530719}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..f568ff9786848e2d11744ee8276c8da5bdd33920 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.22711875267734596, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004285693842994407}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.26630999509216713, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004480479149823449}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.2231300082597102, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034033752307014967}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.05345269343858552, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024415003575837513}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06205142343969058, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024890375706590195}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.051537755471058315, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020776538363222225}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.1703206182001753, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003447489549005928}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.19844320254005263, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035032133447970293}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 
0.1665572765578783, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027440313935594893}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.17403775435701263, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003413819988075513}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.20793289910577953, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038444261569951475}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.17187047667566516, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002787059466070908}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 2.50653501943938, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1003090971755565}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f17554c4a1c28492c5757ca0e6cbb2f734ccaeee --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.06414485296013633, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003867582234098727}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.0654670128693392, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003934205471736375}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.0581230949714921, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003314213174877425}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.014926376522099313, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001564843306679548}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.015583289386150202, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015885386879519472}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.013514623561041836, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001316872673175581}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.048961063263220264, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030673084914120074}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.04867370780315506, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030019830239967214}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.04353495089271025, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002552190003567289}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.050693442956428234, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003120026662916494}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.05205657833407217, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032602925531358337}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.04571485896067389, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026608912164232407}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 0.36982267172015404, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08144270702165075}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..79ba21bb7544398ead0f7061491d9f6718cdec60 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_DOC_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.0026583117523739054, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008091018808969425}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.002178705467661046, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006454400480476721}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.0023608978474116056, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007081747278920328}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.00045740423098913663, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 
0.00023890947709324936}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.00034383593345857494, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00016914370471971473}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.0003900360808394216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0001966707194013907}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.0019692588599943, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005871311929633991}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.0016273983057241343, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004734697044949343}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.0017521055869624688, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005149050874063948}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.002125175022613541, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006406715438574456}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.0017610229095255533, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005181385995086363}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.0018951492030044802, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005633639594497325}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.2867055492649607e-40, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 8.25672229885246e-34}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..13a90be12237d06fccee0a1bcca202684013c115 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.14681218319635403, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001884064620986668}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.3508532628212256, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004480396836342542}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.20445551861577294, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002540631262237117}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.033083758715882615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001075607005127185}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.08359653371387508, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028011003001917915}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.046773184998710506, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", 
"prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015164232120699853}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.10884861486262903, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013621357621873037}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.2618029187147605, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034269796199026133}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.15179708394425984, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018576395558564697}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.11718412599487378, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015873957952529678}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2820874940255534, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003951298591378739}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.16353825551028955, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021770267011811915}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.814930387249839, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11771121619409691}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
"task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..98026a3f67996c3114fdcd8d7f604eca6bf91d23 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.18103718501348756, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033473865145306047}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.28733295807206627, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004482174464490647}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.20309050137464207, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002889520835173405}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.036419012458562515, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016871059092369717}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.060311180548127855, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024245045640873164}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04098230590585054, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016199850679624347}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.13558174388743888, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0026523267315021882}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.21359154903856134, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034331297139569583}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.15089891356003465, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021828562095624793}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.14117090393817158, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026656506352231435}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2267992928621584, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038282221891750234}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1586506104203225, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023161718459253386}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.6291107349733265, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09557892175976454}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a3cd0e68863ba3e7733fbe4eae5a9cf69189fb98 --- /dev/null +++ 
b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.22803370323125136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004013273187193158}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.2570614680151902, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003783252940020481}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.22521229342157734, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032317381563035028}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.05022811623323242, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002274677428345127}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.053690747036890564, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002164698578030171}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04819699499570192, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001989176916835738}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.17227480496190953, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003234340611642025}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.1942078038523242, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rougeL_recall_stderr": 0.0029673906336968522}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.16991437391265418, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025765631696274076}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.17464672535677234, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003215033545422968}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.19974282808154498, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003175109736049393}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.17319699662992305, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002605048780862527}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 2.17318093098099, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11661399106251272}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..390c03951aca6bb4c3ba4bfc87ea3aa9ddec73bb --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.2309959894697129, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_precision_stderr": 0.004585855209595658}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.23766904690279728, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004100502142008435}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.21933409248379987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037339658935042247}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.05495177086256812, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025120253907978123}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.05330679489401812, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002186452458727165}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.050299856364729216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020740941892729912}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.17327034438097938, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0036171037821970263}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.17865802097677572, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031893640720526708}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.16438642156838906, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002914893477118624}, 
{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.17524258448363655, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036164027764869844}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.18241558682117898, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003320041583026863}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.16684760046910785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029393030131824823}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 2.616296095236472, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1539703441634834}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..43b43f3ea0bd55708b5caa6b1f416ad03c3b6b8c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.058089805059648995, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003880608920827592}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.051805567759303485, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033664922868554847}, {"task_name": "gem_xsum", 
"prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.050341607953639705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003194838646765286}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.012863518208923428, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016654669126856404}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.010423647977701705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011866496937258718}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.01044448138213699, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011835889688472476}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.043703688081186784, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030624480401055063}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.03767748319262427, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00246554753588428}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.0370337628617168, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002402644072481797}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.044789301089212794, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031139172812709205}, {"task_name": "gem_xsum", "prompt_name": 
"summarize_this_DOC_summary", "rougeLsum_recall": 0.03920567863599592, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026011812719479548}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.03817599729757219, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002470465261805821}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.08641483709606773, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.022651861523459035}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7dfff9ee897b7299990d4ffa9f980a6637612dae --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.001429388221841052, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.000727956993087229}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.0002449280655247975, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00013374223120064533}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.00040066185006150704, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00021090992858783893}, {"task_name": "gem_xsum", "prompt_name": 
"summarize_this_DOC_summary", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.001429388221841052, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000727956993087229}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.0002449280655247975, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00013374223120064533}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.00040066185006150704, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00021090992858783893}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.001429388221841052, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000727956993087229}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.0002449280655247975, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00013374223120064533}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.00040066185006150704, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00021090992858783893}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..58c664f64db51a715c7508eab45576e0b9b34f81 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 5.867916213031358, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.20628853227818275}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.07942228360422791, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0024044279301916396}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7054367698131732, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006632442357897306}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.1298804110387953, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - 
label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0029630718347134926}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.06361144656013536, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0022825023684296315}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5472295953999456, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007985229704536755}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.10376354583297816, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.002967905827484413}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.07863671663541807, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0023952286615590386}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6995132449021918, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0066928017177045425}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.1285735663218738, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0029483810070099646}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.07718983804306988, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.002403037945378598}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.684699153715215, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0068762487858985465}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.12598534747867804, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029624549245465407}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7aadb571f372c8624cc139205c118058242fab23 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 61.335222795778705, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.8622771243249686}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.6980665022626561, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006567707150422072}, {"task_name": "piqa", "prompt_name": "Correct the 
solution", "rouge1_recall": 0.6593767143434258, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0071185255670560355}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6595089902989296, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006875712478457699}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5473256014844189, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00793225012107229}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5253208631681442, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008110275114235748}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5247847660788326, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007965430836769843}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.6804243326251539, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006774860773805959}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.646527809236356, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct 
solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007333307687760897}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6459449348115168, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007108094606646681}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.6845034339007631, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006741022000997241}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6485924063401273, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007291349785460433}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6483381084295872, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.007060527023002106}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e731fea1a3f7992c65ebfa992e06467056ac44f1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 62.603402535205106, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal 
and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.5475644169974412}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7181416985150058, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006252053466859557}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6760628114661585, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006824731568876389}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6802649706088744, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006555775754673784}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5695607426529345, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007695163044773544}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5432880806274307, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007893817566829577}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5459465123609488, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007745165296981821}, {"task_name": "piqa", 
"prompt_name": "Correct the solution", "rougeL_precision": 0.7019668030881252, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0064563320671559895}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6641579045487708, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007044290418742941}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6674947636505139, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006791046114300905}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7050246857130568, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006423110240312336}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6660735324572596, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007004108761350961}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6696194322813839, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006748257374785954}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..652de964c4e7c756ab9c0136d997113eec786d4d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 64.06004196791218, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.0446093860205532}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7232440140186012, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006116632364498603}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.688581159330267, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.00663873709207145}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6901181418640525, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006387092151812126}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5753318864237054, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0076304447710168035}, {"task_name": "piqa", 
"prompt_name": "Correct the solution", "rouge2_recall": 0.554044381568116, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007810039162687954}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5544379445377776, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007668802228151554}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7068401047024229, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00631934193944798}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6768332672562108, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006862112088829738}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6774190073052743, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006628030197945172}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7107370870856662, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006294581178188967}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6788431055005977, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong 
solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.00681711472710704}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6797541072212353, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006576199106862475}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..39dda8cd773f88c4d3ada720aee8fed81142d8e2 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 65.55325196543947, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.1947127408461486}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7282001628299452, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006078864792051913}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.69880441201827, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006520733587303046}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6996217800102895, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": 
"Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006317598090843756}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5828059508699759, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007610735477678452}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5649714199883233, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007745097163747005}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5652369927255047, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007629351400576189}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7140909753910173, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006280916318478117}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6878212259946481, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006728703316848708}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6881606645893026, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006539431927546592}, 
{"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7165909256663964, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.00625485888706704}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6895293461590946, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006694901252546183}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6899882354334018, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006505404310157879}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6118b03325897d88840041afffd338d65aaab683 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_Correct-the-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 66.64515267205795, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.061858736228344}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7294671013001317, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", 
"rouge1_precision_stderr": 0.006065785974127885}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7053377398801107, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006417448274249115}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.7046939479107203, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006233955812695059}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5861943307872125, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007582514378127309}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5693588626949632, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007747442919717387}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.569623452216584, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0076345097393273175}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7155258182519211, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00628300942244068}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6941393539742509, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006646700094687056}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6930681146662367, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006476139258559119}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7181565430000993, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006242949773749711}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.696180538077367, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006602511857250467}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6952120905322114, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0064310406471001225}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7a2bb42156b8d49acb5134380992486dad98b10a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 
0.49455930359085964, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665133500637059}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.49455930359085964, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665133500637059}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..42391d5a51b1abdcf471cb95b3cd0e4cf17a96c4 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665713661738877}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665713661738877}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0390163214100f9dccd9d3ac879416d205bde3c4 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5087051142546246, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664055982032842}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5087051142546246, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664055982032842}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..84e601c9f653b343fbbe746e62a96fb1493059a8 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5184983677910773, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} 
or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011657837583818161}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5184983677910773, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011657837583818161}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2a1d7382e04664ac8cc269f0764991ed38b45c81 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5190424374319913, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011657360703051447}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5190424374319913, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011657360703051447}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_5.json 
b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1034a3be127064e138091ba07ef19d45fee5f526 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5206746463547334, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011655846995729705}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5206746463547334, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011655846995729705}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..576fb0706044ce2bfeafe690c50476f62ecf8eb4 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.1485041698567138, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.007315927282004117}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.01974171386059608, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0004952361009360189}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.21534097998297544, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004166248254601441}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03437889361344771, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0007987086688652439}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0030525841083055595, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00014616090889610437}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.0367404892573986, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0018644594321576408}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005373478710058346, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00024914938847529126}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.01790945229515576, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0004313558223644517}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.19829194789252308, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0037997755232370687}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.03125352104038582, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0007012062341285741}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.0160655540543357, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0003987980222331984}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.18249084236327845, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0036813208394015054}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.028029841637843726, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006415770236915785}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cc0a6d778c0c7836ca7c484be17ed824ab151c0a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.29412714768586884, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.034360669972773594}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.07368232247410113, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.003164750969347226}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.15766787120459927, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004114679609697215}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.07158904393328168, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", 
"prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0024799328487658226}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.015588545852027387, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0013404934159168128}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.02749471159881858, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0015931425342235485}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.01427616756655093, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0009774976022813842}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.0613981722284925, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.002639261551928193}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.13877990083713132, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0036796850148855835}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.06022053549130187, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0020697663168887737}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.06279067333856057, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.002781722984417726}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.13573872320792, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", 
"prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0036419851235016585}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.060445332342990314, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002126572966319134}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..529e29ebf90bafc1e341b9f1be78403cee852b68 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.7257226104526276, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.04974444170266115}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.06838800622837515, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.003415935337029084}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.07342671774462553, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0033235156125216965}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.05613337557154679, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0025574265364955565}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.017058473371860172, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": 
"{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.001674645431479306}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.01682784685476785, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0014448109257036853}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.013394834397133638, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.001173056970239741}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.058968248594693275, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0029726310561432943}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.06438738644094597, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.002961784919389523}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.04866283010304908, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.00224397834501404}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.06049770286352172, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0030730882420140623}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.06461911786967894, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0029694396182963407}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.049582431239121254, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", 
"prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022971793124355027}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e38db8c77aa8cc222fcdc626cd6a647eb83b12c7 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.618645347711144, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.06077101383767482}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.08132178370310313, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.00374448816425401}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.07900250321756425, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003360802844922738}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.06473304019823428, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0027236205074401805}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0183620415895074, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0016611524429967466}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.016974165028495372, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + 
sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0014683224364738306}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.013861400466696013, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0011327812094035596}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.06893019272931476, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0031686455340987937}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.06953656378352016, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003022144575602847}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.055560374775413285, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0023456775505970827}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.07140577564724335, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.00331410052405056}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.07010282395858061, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0030213293550535717}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.056891001156107575, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024083482693287536}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eb12a98508c7e0f0aea5cec7ffe8489299a6ba17 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.6316373732726734, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.06439416572051122}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.09702203366548948, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0041167329926756905}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.08876078044435651, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0034053343174592437}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.07655031903659895, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.002949142358861536}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.023735273095661872, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0019962706229605222}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.018705153963296396, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.001528405714423096}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.017184076709079645, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0013681327172434656}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.08290890093834904, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0035342470286738145}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.07811395555950774, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003073914358976181}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.06625753359506499, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0025972013724176652}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.08594629250205217, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.003694845579474083}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.07970107699146455, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0031266107330673583}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.06806949198182898, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002661868418015175}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4853ce9764fff017b87c7a6aace0934e0b177cc3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_no-prompt-needed_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.7515007260497569, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.0607371553698586}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.11041227742052115, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.004319670141318303}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.09875052543479973, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.00361264598137581}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.08641128265874654, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0031216881179551023}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.026424633415174224, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0020835974054162335}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.02144585619916054, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0016515571686378792}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.019362442756114775, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0014460076587289445}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.0955827666174569, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0038147197500493523}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.08729998313244142, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003273426860969895}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.07523875031649757, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0027701164872551303}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.09853735471232261, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.003943435886305644}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.08851997873549314, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003306265435627653}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.0769072306358556, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002823987889826904}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5b78c3ed6412b497499bb82ca4b93f51af9ea78f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 
0.49510337323177367, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fe47b4c8eaedb77d62dbd270547ac325ab89930a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5032644178454843, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665575530760367}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5032644178454843, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665575530760367}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, 
"seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ed94f3cfea7fc1a27e60b8263577eb6cb920eb94 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5108813928182807, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663061261117746}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5108813928182807, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663061261117746}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..74152318eae5ff25aa66241551336d0288ef5030 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5032644178454843, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166557553076037}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5032644178454843, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", 
"prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166557553076037}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..03892d472ea065cf3d6904a347043fe3f4ac39e0 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5092491838955386, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663828032649181}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5092491838955386, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663828032649181}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5d56b1cff0e2ef6651474fdd0cca8ae653791d45 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_pick_correct_choice_index_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.499455930359086, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 
0.011665817258899177}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.499455930359086, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665817258899177}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1214070c03977b5deda7adda564bdef8d3293155 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.573993471164309, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01153737544851944}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.55930359085963, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01158347809065713}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9fbf78858935a5e2ab101e732474bc8d3e27046c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5658324265505985, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", 
"prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011564264866016057}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5642002176278563, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011569259195486625}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..89661947285e23b82ecc06ab166b34dd1299f099 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5745375408052231, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011535468840824526}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5712731229597389, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011546694435712187}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1a611117a405ebda1f6c9d2a0f0895ddcdccb78f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5560391730141458, "fixed_answer_choice_list": null, "dataset_path": "piqa", 
"dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011592322161277832}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.558759521218716, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011584987344513572}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0dea23d61f30d63ae5405aa0e14522c7b5cb814c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5533188248095756, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011599305042745072}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5549510337323177, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01159515750977576}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d953f8144a67cabc5dae8468935bf4fa589d3478 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 
0.5505984766050055, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011605936624156075}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5505984766050055, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011605936624156075}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..93e0c8e87e92ea164678387bade7c9aa8f6a7bce --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.573, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01564978964446222}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015819299929208316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..737d23c3e3b88e9523e7492ce2a81790a41ece2f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.651, "fixed_answer_choice_list": null, 
"dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015080663991563097}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.624, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015325105508898134}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ca826cd83686e1d887ad3c58dfabdc5c1827c3e9 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.667, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.651, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0150806639915631}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ae47d7a4dfc5d9e9d89a39e5715ba78bdfa4ed08 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.666, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", 
"prompt_original_task": true, "comment": "", "acc_stderr": 0.014922019523732967}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.664, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795023}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c74c211f4fcce3ea7006390332377c950afb6abe --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.678, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014782913600996669}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.669, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01488827258820394}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4d9a6281c1e32a9b89d42a8c673496a6694eb425 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.674, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01483050720454104}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.664, "fixed_answer_choice_list": 
null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795021}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0df4fcf17f31961891b0b116d508f327480e8495 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.848, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011358918303475291}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.768, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013354937452281572}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1bf855d4dc1f2e363ec34cddfe371bd23607964d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.895, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009698921026024942}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.889, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": 
"d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009938701010583726}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bfb8f871eba2e81da5eb2715c3b54013fd283da6 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.905, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009276910103103306}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.904, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00932045443478322}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b317be00a90be8361cf4cb27fc1140f0531c2af2 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.908, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009144376393151108}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.912, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", 
"prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008963053962592076}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..24e7538d39d9580efc9d2b41b99ecab0b377104a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.912, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00896305396259208}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.906, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00923305200078773}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c4a14181b820613fd712668d38b46e83983b6f2a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Direct-Question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.914, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008870325962594766}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.912, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question 
given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008963053962592076}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..67a03a6b3d43dec4d9eb3e662300da8c2286f274 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.346, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015050266127564445}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.35, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015090650341444231}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_1.json 
b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..87c6366fb1fd2177d4422b44452c866e7df2869a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.398, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015486634102858918}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.409, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015555094373257942}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0b23117e12d921dc1d5af3634a8f5105b14b5e21 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.374, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 
1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015308767369006366}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.384, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.01538768276189707}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f10b9445d45812e2c10b7fc818c7b3c4685ea293 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.405, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015531136990453045}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.416, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], 
[1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015594460144140607}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e82ca45eedb9baafdc4efd9538e970594e3fecb8 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.395, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015466551464829347}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.4, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015499685165842594}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
"task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..eca4f27fb5afc3624ce5007097500b93bdb0fb11 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.4, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015499685165842592}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.412, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015572363292015097}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..767a8976596aebc594b43c33fba4b706a5ffd7e3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.401, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", 
"prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015506109745498318}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.388, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01541731797991108}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9d79beeda82a851eafde56e0fd85ebde3607738e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.357, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] 
}}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015158521721486773}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.372, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015292149942040577}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a4f42d9fd9afeeae8e6add0c1c920bdf3d3f1fdf --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.377, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015333170125779847}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.396, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 
0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01547331326585941}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1b9c3da465d8e9417b1b4e0b82dff65cb65ca2ed --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.4, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015499685165842592}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.412, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015572363292015095}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d5c0a24ae2fad7caf70838332d71013e785f8c5d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.405, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015531136990453045}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.424, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015635487471405186}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_5.json new 
file mode 100644 index 0000000000000000000000000000000000000000..f7140b57ff2b823fa5d288562385c22569766fc9 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.399, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015493193313162908}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.417, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015599819048769618}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bbae06a475ddf899471090b98e66283d1d7f9266 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.366, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 
0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015240612726405754}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.375, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015316971293620996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..368fedfacb3e22d1939d25916c14af4dc6eab8fd --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.385, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01539519444541081}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.397, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": 
"63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015480007449307989}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a19922c711a83b67cb6c96634127cbe11f70730a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.373, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015300493622922809}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.397, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, 
"comment": "", "acc_norm_stderr": 0.015480007449307982}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bbdef9dcb1fb86a7fcf3e6a919f67bfa962a27a8 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.411, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015566673418599275}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.413, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01557798682993653}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..8b72e98a413546dc266c0bbedb4e0ea513fdbc0b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.417, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015599819048769618}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.432, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015672320237336206}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..07652c964f2de60269b907df10ffb7fd36aeaaef --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_sciq_Multiple-Choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.407, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 
1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015543249100255545}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.413, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015577986829936531}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..df1f38dffde64df24367d5eb490e8d4c19cd62d3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.5152324959914484, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011557065368348288}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.5344735435595938, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011534917341355127}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..335a26343d780d95d9120ffb72bcda2e74389f39 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.501336183858899, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01156239096465875}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.5093532870122929, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011560409019420374}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a9fa4ce530bd290c7292f2c7e991ba0ace22f3ef --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4730090860502405, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011545573278697239}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4740780331373597, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011546883081384893}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e3afb246618a486e66af9dee044d2de282955c79 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.46766435061464456, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011538227692217273}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.46392303580972744, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01153229486915312}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eded8a8ffa6af8f73d2e030d7fd8338fb090011a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4596472474612507, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011524715486240652}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4649919828968466, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011534056494505864}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d7ec503989c9e5204416cec857b5c4fa0e9e5a37 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.46018172100481025, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011525709570367509}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4623196151790486, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011529552555884573}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ae9dbb769e3d69189902fad59b40156a4a0a6867 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.5163014430785676, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011556285484521566}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5291288081239979, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011542794417345719}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..96343753b910060ddf64189928cae7db59eba0d5 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.48850881881346875, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", 
"prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011559378273599121}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5093532870122929, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011560409019420372}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2bbcb6db8cf88a28e662dd221150c5e3acc6c513 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4692677712453234, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011540570846495546}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.46873329770176375, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01153980308563773}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ad4cc434a2a3ced5438572bd614eccd01f9870c1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4564404061998931, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011518470676766509}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4537680384820951, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01151289919986303}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..409193500f9035ece10e1392ca5636f9539ca210 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4575093532870123, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- 
{{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011520605695184078}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.45323356493853556, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011511744771088355}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..48472ee7e0897347643671d3d21129905133da73 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.46125066809192944, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011527657726586461}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4575093532870123, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01152060569518408}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..21bd346ba9ceeb98346488a514dd769962633c58 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..779abeafaa63e395db80277415ea4e6bb4d1c764 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2124deda5986b17c39c111671c83f6b21d62551d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6acaca0b99f7c7c6b8e095313dc075fa0873463d --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No 
newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4822049ab167d7cb8177501b832aae75302e80aa --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8943c761d24a8bad7fc40331fca6d09a96b9635a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..38ff90a7b430059445cde9c8e2b69dbf99956322 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.5173703901656868, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011555452669106634}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.5243185462319615, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011548748301487312}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..57f7b43e56a990d2403eb564a7418ba441bf9909 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.49545697487974344, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011561954965856516}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.5141635489043292, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011557792331301676}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..16cfaca152b3c96acb10abf28666e91c517cf631 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.46980224478888294, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011541325320336615}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.47835382148583644, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011551591851683338}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5cb929f10e58bcf4f71126f774290df0b71d95d4 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4778193479422769, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011551049647290314}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.46766435061464456, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011538227692217271}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..25bf74b6b62de190a5bf135afa4a9825d82632c1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.45537145911277394, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011516282203726655}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.46125066809192944, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011527657726586463}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d1c4986a93c1a8d66e8fd584bfe486bfea4b2c14 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.46392303580972744, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01153229486915312}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4575093532870123, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011520605695184075}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d16e4339944055c8aa0483ef6f12503f908783a1 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.5291288081239979, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011542794417345719}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5403527525387494, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011524715486240658}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9f7c38d4bd0710f9e8d2155cc1f5176b066d9d06 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4911811865312667, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01156063365695297}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.49599144842330306, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562060664045738}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f4e5234c69a0d092e02c68f18ca5f975c9cbefcb --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4633885622661678, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011531394084549621}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4665954035275254, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011536599118298163}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..931c29e9ad8180e51e061d6d422702b53ddeb173 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4660609299839658, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011535764881641411}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.46392303580972744, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011532294869153118}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..acaf912f3da94a0c2e165fd8fd03b6c1a3defe5f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4649919828968466, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011534056494505866}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.46178514163548906, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011528611805439891}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..294ba583f46f6c69745abb349c704750de832918 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4665954035275254, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011536599118298171}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4649919828968466, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011534056494505864}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fbb1fd08960ab0ac78466e5f3e626115e1930a32 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4c493efe304a009ef58e1b2b2fe0105943867a4a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5090252707581228, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331327}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..810de424239c41dafc6409a5c93153968371e1ce --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03003973059219781}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03003973059219781}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..01df9aedbc5753b78a2d52aafdf6020ebc411e5c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.516245487364621, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03006330041190266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f372f90b108ffb5d0db1d5a25c892dd5fb772d89 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5018050541516246, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9378b52681b359c3a94ea7924b9660aa90e69e20 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.48736462093862815, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.4620938628158845, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030009848912529117}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..caafc45e3e4d47982e33c9ac8250e887813e6f61 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030009848912529113}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..80f6d903e5bfc28367fed337de694bdc238fe5ef --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aa8e1d410a67ec35ba864535f9b30d3cd77fe7d2 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e490df9ab96df3f7440fbe7ef84240da5af12ee9 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a12199909a7a5ec723bb5805964c7798f43293af --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.4548736462093863, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029973636495415252}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.4729241877256318, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0487cd18895e28b7d2b4b4f7a08154644718b036 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.44404332129963897, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.02990739633379599}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.4620938628158845, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030009848912529117}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..75045f3e3c10092a0f8c296e79a7d5547e482235 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030039730592197816}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7dc79929fda27f2c216ec1ea88e247b2d554ca5f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1e54b9ff1368de691e0844389390f63dd099c83a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.47653429602888087, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a54bf510718a242b8ac7015277ec018cfd0d0555 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.48014440433212996, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0300727231673172}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1adb46d134f05d7e1e7aa0a8872e0e649fae7a2a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.44404332129963897, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029907396333795994}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.4657039711191336, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03002557981936642}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9a2cd789c12cfda9c54e93c2809a68ed79555442 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.4404332129963899, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029882123363118726}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.44765342960288806, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.02993107036293953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0d998262f8a289c3bc1db287d5f1d9bfcf34e4a6 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ad2da9b45d067fc6d14878420b83ce039f4fe7b9 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0b49e2943389215b2f15a08dfdffed389de7fe69 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7af0415ab36494bade01fdaadd57ca06e3f8645f --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b2b41c7c0e5a4478161df1a92b2672f7b821a7d6 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.4657039711191336, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03002557981936642}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331327}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..29c945a579325c0eed5e853676ddb47f25849bb9 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_guaranteed-true_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.4223826714801444, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029731622646495887}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.44404332129963897, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029907396333795997}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c7898910d3080a8e5dcb1bf8698920e25bb39fb5 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030025579819366426}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eb665e5e06fde293867fecafe9b9ed60eb272d94 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a611aada44aba92b1284300f0092445f562e297b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030039730592197812}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f0f7022055dac0233f2a7a9dd29e389c9b66f26e --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03006330041190266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d17c3d14144c95cdb582a4b5c84b78a3e480d6f8 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..653028aacad972ac953a42938e74ea3362067acb --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_superglue_rte_should-assume_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.4729241877256318, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a09efea3358a8067d04752c96382d6d8727c5f87 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.516179952644041, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045126130978601}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.500394632991318, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..634c534bb8bbc93a4e50c113f2f3000530d26622 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5295974743488555, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014027843827840083}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5248618784530387, 
"fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01403510288362775}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..38ab7e7d69183d2d8cb57241784b9f536d71a865 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5224940805051302, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014038257824059885}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5153906866614049, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014045826789783665}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..06cbf097904742d8afb7d7ab4c2b161a441398a7 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5217048145224941, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with 
the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01403923921648463}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405090552122858}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c3bf9fedd09b8a06799ba25505f91c1bde4b3fff --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5146014206787688, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014046492383275835}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..aad4be98bccab1e6844df01723df026681217f9a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_Replace_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"winogrande", "prompt_name": "Replace", "acc": 0.5122336227308603, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01404827882040562}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..15f0b5aa93c0e554a04fd0f3806c7e5a885b6357 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051956064076896}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014051956064076896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd6a4ecb377a38da8f40a4f464d8684a705afab --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5082872928176796, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014050555322824189}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5019731649565904, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052376259225629}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a99859e1d8fa25fc9f2a079fbf0438553efeb067 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5059194948697711, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.510655090765588, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014049294536290396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8083b017fbaa78ea591ca09221d52453036a9463 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5130228887134964, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014047718393997663}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5130228887134964, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014047718393997663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ac18cdedfe76127b8034aceaa33487647888ba29 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5098658247829518, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014049749833367596}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5074980268350434, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014050905521228573}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1b5575702225ab8314291790360f4c61d8762fb3 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_True-or-False_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5185477505919495, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014042813708888378}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5240726124704025, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014036189665395132}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ae68f78dfd17a82fb233612990c3b6d211241f92 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.49329123914759276, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051220692330349}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.47987371744277824, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014041096664344324}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2a85cda4423c7a6fa3736b42ac884a91c991d075 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616441}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4980268350434096, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052376259225629}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..394ab24deed916731f6e939799f5fcbfc3c2cb23 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4877663772691397, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014048278820405612}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.49013417521704816, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049749833367585}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e48dc90031e91d965380b7939ba5a8a52377a339 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0140505553228242}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4964483030781373, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052131146915845}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..71c2faf83b49865522aa7a00a34ec7eacca63907 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.505130228887135, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051745961790516}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616441}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c6fab2b820c535d3f78168100013bc8906f5096c --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.500394632991318, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052481306049512}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.500394632991318, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049512}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e6c62ebc94476df3566504fdcac8adcb6af91be7 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5209155485398579, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01404018549421295}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049512}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..193f617e71d6c4aec6860bad0f3d5307767ac58a --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5130228887134964, "fixed_answer_choice_list": null, "dataset_path": 
"winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014047718393997663}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616438}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aa52c0b380572a8acd997125dcd569757a0393df --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050555322824194}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0140492945362904}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e84d124afc09e335e551f32f6a4f2bf7397816c2 --- /dev/null +++ 
b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.516179952644041, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045126130978596}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5130228887134964, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047718393997663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7aa302279dbba2ad15a19c9f1be19af452b8129b --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.516179952644041, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045126130978594}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5177584846093133, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014043619596174966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_5.json 
b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..73a32e1f42f69a59a3a89a5713f69cf9e2408421 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_stand-for_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5209155485398579, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014040185494212945}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5035516969218626, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052131146915867}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_0.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c1b41c9e49dd580381111668224f8bec7e8000ad --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5090765588003157, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405017009449771}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.4956590370955012, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051956064076906}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_1.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dffc8a386e30cdb4bb2d6f4a20b90095240441e7 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.4861878453038674, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014047122916440419}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.4696132596685083, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014026510839428744}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_2.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7213ed1148f99c30271520c7635d93bd178e62f0 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.48697711128650356, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014047718393997674}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.48855564325177586, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014048804199859329}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_3.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..df35c38d7c5cb631e312d6d5dba773c73433bfb5 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.505130228887135, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051745961790516}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_4.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..419895323dd867c4584f9dd46087ddc6043ba6ce --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.505130228887135, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051745961790516}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050555322824194}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_5.json b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e31f1923b5f0365261d17a6eeab11780e5e389e0 --- /dev/null +++ b/4b284b21boscar/eval/agg.4b284b21boscar_winogrande_underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616436}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.49171270718232046, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050555322824189}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90f56865d3c5afd9e7ee1d76e57dcf037dc55051 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12e7a5885b991a10914be0017f63a09c8c822db23c0114e98b300dd567bb6145 +size 8004450 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..30f8de7a651d731d878ec0293697786d52fef2cb --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83d90e5c1d97fd5077c7fa34a017a74a3ca1ef74e6b0be4548c10eac320620a1 +size 4697479 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..33f6e38285ac1ffaf8703c69eb2fda8a24e7613c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dc530f41120018672f52bf6e9a47a1a3bf78c95a48536859470dc9f0a539121 +size 5532287 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcc929b5e29c9b5506311908865eb4e4a6782cb4 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98b5c6fc5de55a7714166f52d7c1170fc0924cc14f3b5575287dd9863cdacbbe +size 12899948 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..952ef48a45c12b1aec56d942c053795d4b4d7bdb --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cce695b06fc5cfe99a6bee2d78d7f0665d58c2b7ceb34514d64d8ac78bae1bd6 +size 14642170 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..95015f7347d06b23606a9959f58ecbbeb9e374c6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed9073f36d7c8d6f6a6c50813e32c976dcd976df46f61ea9cf0ea48a7328768d +size 8208456 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60772b760cc9e12b1208f2c116a980b955d09261 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a85749a3f16f70b7d6251990adee5638c3b6034757b44e8e60086d60c54a6e52 +size 8705248 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc191bb7928568d3a6fe2488bc2569a0cc6cb1c9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30b017cf66b1e5d92378aa774fc2ee0f888511b5c5d935c30217ff219e87dd9d +size 3478821 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bfd87146575fdca6c7f936b44df9e1b8eaa779ed --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abe96c1460f87332d438b633d6738e9d820af56f01545c014513732e6625ad5b +size 4193195 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b5bc20bcc9cc1c9d5e0498d9c07a97a2dc16c9b7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f947f30f362c94b687a2df5a7833f88bbc56cc0597fda245f5ffafc9c5c0806 +size 9743630 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2d87bb6cdc93793a538e1fce8acf6c21c4872b1d --- /dev/null +++ 
b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98673e850a1ee0ebf805c25bdffe289021e128b13305293410ebe8609a6aac8f +size 11165992 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90c39a45935653a66969af5711693d5c75ad051a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f0e72ccdf0ce060981d8dce095925e921944875d8445835e62c9307c04f392f +size 6283958 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9351d19b9175ca763469541cc5fb29d94a3272be --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2441d60b4974dd596e8811a43532139e5558797ef4a49d4c630929f31833c53 +size 8873656 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b48e96897f87b2bd416bbc9d907dbef97e46e5b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:079c36f4599c3b356812f9081992e108e4b5b4dff978d2b1f0511b6d38410b77 +size 3231301 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e64198aec0c7d33fc86d242e4d61e66c2cb21d9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6429aa9d5dc799966e1bbd8eb2d91f9d3c2993bc4f51d9489244a1db648f38aa +size 3832246 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7a811a07a91fd60382046e969035c5dcbef89d6a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4381f2277d7c8621b28fb0d1df9b92fb702e6ecfd77c321285e8c83a0f7978e +size 8960912 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20906e45e6d174db877089b94c090808242a6554 --- /dev/null +++ 
b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8061d31b816641dcdf0c8ec69b8dc38857bdfd5cf41234b58f570107263aca73 +size 10245998 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4010253cf0d4dde3df5f2734d728fd6c6458dca1 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4207214651996f217c3a38fa32ea1410f46bfe30b027e7b6aef4aeb4591fde37 +size 5778068 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fd3c5e1b05355c7a3beb697c874c6636dcc2afce --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e6297bec583e19491a8110fe1ef0bed0d5ba96bd7b5beb95297763a84b40790 +size 8255636 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f9c649fb86fd1ef06e2fdbe9414799642e8dec97 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2895c024b6e12b6b7c2b59900ebbd947b76a91137869a394346b0c18c9ab6a88 +size 3886719 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4ea6c21618c34743fee9f4169e8e54cc19329c43 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ec6fd25b28a7bf99f506aed543184cf148bbd8774a41195f52f72ec28a54f13 +size 4691214 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9bde95afadd5e6e204fdf8d8ef9a90cddd970c7b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5675e40a719b75a05b6ec4f7455c4a47e84bf4ac75c86301900c186251091db7 +size 10988002 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..082510e41130c72dc5c5fa541d4a043ebb8f7ec3 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:f30be14d4a5901c538e540185df80208db2f3c3aa761165e0c88ecab3d9defed +size 12707502 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d348c0f4ab1a8347910e337091ec57a90268c85 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc764f184e8ff5e6f4f24f611c0655e5782811b10e8813e77df619ca357acaf2 +size 7202557 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c99e55a44914cfcbd45c7b1db1fe6f61aecfacc --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9519eb5fd368ad288eefeb352e0bef92811d47e2511bbe94354e7cd716ff5142 +size 10352816 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0650f8409669ee0653ba311afa397128969e5b47 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a24610a290926dae9c43b691de5c3d20baf5e3bd857af3486ee886c577c897a +size 5456800 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..43e1ca0b2f835c4b3accc6c4a2e3e12cac8ffd89 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b14f8e45f5202e48ccd49b867cb8b6117a93766c08e943ca069d495ae8bbca5 +size 6626783 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e762c749da924e06ea9d0e2d7921f48fb3583f23 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be205f29ea1c2f3555967c51a0b8f09abf273f37749cf529263cc538178adea4 +size 15757898 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..88f1a4e3bddcd54ad38a4c754f783bf3aae90c44 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:986984407673e471e1d10f67c622f965fd5097ad594194db70bccce0686970d4 +size 18337084 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..34fd2069f599c50540862e70f36e63c902b4e9f8 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6f54a5d6a53fcc22e9f337bec09b2feb9f05b036f55a61cb4e3340c8dada516 +size 10484388 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07f0f8b1bd35bf63a3c84771ccc355707e638924 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e38383ad701e41fdddf5345647e6c2aa31a9d4b3eb1fd5838f9c00dd7e9d80df +size 7546539 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d00d12d63c0936206bb9d848cceb1174398b230a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bdc5ae0f1b5fbe4a361e978dcb319cba43f89f6e4b901b95ab196f8bf96e9f7 +size 13417545 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0adfcf9aebf4b5d92d5f769dc561cbe1e021205 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b1ceaebd343dbff43c549fc226232ff23bf891fe83a76648d767533ee45d4c5 +size 19001234 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..942699e6cb7c770d5ee85acf7ceef574a6877662 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4f741d018afb263fe2f09500c5d855f5f94a2895e7bf5ec81cf3984d4f8dc56 +size 48949506 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4f112d2d9a6c5319c98077193d9b13042c01c352 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9ddbc7386febb552ed19d8452d7419293cc41111ed9e4c7f8911167d14c351e +size 59627502 diff --git 
a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4a2ac5974cf7ef5eb27f468427164ae95cdb3836 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47d828b3f10482e67e5455212ac095ce4eea8358357e8543d90c799d5c387ce7 +size 35279900 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..94d770a50631df98a606392f7889f4e3132cf9c8 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a408f8d0cc6e42243c9d41a642ec6af7894de68ffe93b486640d2d3fc9fd0d57 +size 7762516 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b0c343c6468ae9b93150365c71ae674d4e4fe34 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:238709252a2f06a2e511ce50cdd578ff94fac2c9e167a254b62752aea054aeba +size 13256494 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ee994b78533f417d6184a861cb2389d32c64eefb --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41d07dd0993cadc18ccc1a956984e613dba1a876eeb71bf5ec27f1c81a7bbde9 +size 18849084 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6ff6feb8a8cb6ab78d66684166a9e6b36314120 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2135653fd089045b7c81055cfc36551f81bee4f8831d398c08f353cafa2d742 +size 48914968 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7c68ffe61b39fba0c008a687c400bad8a08ebaa7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e750691a10e8f994ac283a83d27a336915eea77d61f71bd2015154a542bc1d2a +size 59825018 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..67337d547fd640d56a8cce1da716f1562c97de66 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4357682153123caae28eb02c08309fdcc349a78c4ed2ce92c7489c9b572e3224 +size 35424157 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a75485d21d3fbda3f4c05020c3a3f8d5f77dc4fe --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:224a90021966ff8f8a4e926f3f874069c9160284aa0fa094e98c1ee94eec43b3 +size 7533904 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..578ba2fbfadb1a0c7dd9c040723dfccf1b93ee99 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4942fd6cd2bf46580901ca736c0d7ea93026a077da9e84b0831e89cbe72ed46f +size 13370384 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1149946655317662d153e9f9a012f78fbca0d5a6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40dd03169a150f1670e8281b54d31fdc07719907b37ee47be98756588a6a693d +size 19043348 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..56cc598e21c9feb82751f53f13c83ef650bb6c67 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71eaee9b64b73481c7c0d481e925c86d5b79f6d6c59e49df106dc7af95ecd74b +size 49201896 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d12d451eb5d0d5d1aea3ec78b3a6daf1235e5ba7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ab7c58fadd6445a9438c514018e7ef90e9b25d2c4310cc4fcd48ec5359ec600 +size 60070490 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a2857b26743fad931d9c9b800694ca7e8122aaa --- /dev/null +++ 
b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1033ceab351b9e89a4e3f5e3857c8897137adb174752080eb855b389b97e9b3 +size 35569485 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..73215048ca2961bf79c82ca9d2d141dac3ada63f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efd31d0c3b8dc89ce8c8a35d968ba0c5028e1e58be7df95e76e9c157515468fd +size 7360265 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d095df2300b00c8820454d9879d337b052dfc14d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75caa70f61ee03d6be7a9afe0f5ab9f7de746d08b537931809b99432cf256aa7 +size 13010048 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..505eba01e7faab1fc620cd3b907f4daa30a9fa74 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:807b03e72628420904d04fa34ceee7c9dd57112b94dc70dbfd1d5b87f50520bc +size 18556171 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd82365d7389c8e0d6ef8b6ca59b92d93ba11ca9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fc4c5f6d0c8090fbca9b31c17293be5f24141f99ff9137b013fdbd0c666e06a +size 48030058 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..753aadc5d0184cbce52b3df607c16b545709c939 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c29b6ff0c7d8ead66750d8ee09fc66ff1faec5bc5cc802d4d106167426e2bacc +size 58721310 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6764a889bd4f818f05d9abdb3d90be2389e6832f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96c7f0b55164548e8b862c659423430b3263188aed0d9367afedd1be403d87e9 +size 34783770 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl 
b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c146cb44fd626a7ee607ec11a878a5b1952104eb --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:225a808729e0bf72bc2cda8c186d298eda0b3b03cf5a86c47d6bb9dfc16f41a0 +size 8151801 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b3b54204a72316217b49db9ff401e8c6b9d9163 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb5d794a3e2b9851c0fb20e1fc116b9324f08aec708744a3c3f7fc19ae0db963 +size 13778847 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba994dd7a7cda449e315841a8abb4ebc2eebaf80 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b695582092eede252dcb1e17bce2793fd570926592157f51086cbb1125da923 +size 19715450 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..96e00b1de969637a84ca38086f9ad471404b54e9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:167c7857345dca8d6824a04a788b3e9b3d111b112e9dfcad332210e287889308 +size 50652004 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4c17d1104532bc2310f66adaef447905fb04135a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a412cc30576892321ec8b31d0c633e311e1dd8bc690bf791df9d787e55804334 +size 61627722 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b54c6c90b44771249f44c375839d9a9d2c0a520 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe5e814d0ed9e0de03435dae4613217e9ba352fd3af81159fc676abb98d32566 +size 36466336 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..317da5a9ba9e410d2f8ae3a30566df8f9b28c930 --- 
/dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61fe4219b6fac720b73f8c3f4a133c764c44a749c9d66d8d7da35128b01b50cd +size 993114 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21dcaae5efab92cb831e6fd55d0e9dcbdd6a178a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cdb13087e15325ea23b16a2139f77132a785441684fa191f88ff4fa8dd8251d +size 1451261 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..905989d8e2c48a559e4085a2c8e445f583c341b3 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29f3c660a1cb543cb39c393c4da3c57480a749b6e849274ec08c6048a922dac6 +size 1910101 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b0574017c289a8b8ccb2a1a5dadca826abb75a5d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e856001a93fae0d3c7116936229ccedb00b0d838a4a01ec9305333136d069cdb +size 2367468 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..caa7756a5abd8fd36a22ebf5470e48107a0fb61b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc445c6accd99454e3e4f279d2f1fb2831e8dd483a7473d59f9964fea42a192c +size 2821834 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60e34b4c00ef48e99b35954f35bef7146dc77f2c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1227620741ab009051a6f05937c16eb0bc3b79676bf1898f563d976d8f09cca0 +size 3278134 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d547a4de266bd3cf593ae5e569aaa2c566c9ecda --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:690fbff917364e834bd75674605aa260eadd58013b949c4593a3d5d2ab3599d8 +size 1203113 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..b1ce878e0baca92bdffc5e1e95712a14e4725e7e --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:701bf2d17666b683084b60e3615d0beb15979b4cea0af420cfaf229c978efdd9 +size 1755006 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..696cd8f99610c254c50d5ede7794d6fcabf9c0dd --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af3ca74186d23498dd1a93cf8cb1b62e4e72565002de27ee80de23b86e4f7ddf +size 2305086 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9f5ad0968ca86b86e840b73193cb7c3b3a26e35 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa831c3c3dec458b519726eb2a72d1ffa0598f99393b6e5a71a9bd5f6ab451ff +size 2853902 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..10ae7242212c1b6cc4dcdf7c075ff54523ec3d47 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d5db2fcb3075b71602ea17f3e0b17c965b10f588892489b3fa7feb0e0c33996 +size 3399585 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b32130560e4489560ff8b6b9c787fc354a529b07 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da15ef61c234a7e6fd9da17d396bd0e0280bec3cdb4c80bdde8c4fd0acb50137 +size 3946983 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb169041edf46041c7f6b0b78d75c49b7ecf0830 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a3854d260d12edfc1db4af89bf473caaafa8453c14393fae5c3df70142c34e0 +size 1007757 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45b2782ac4652dd6852bf2dc18fbd3a31ae6b747 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad51886490d5679fe5e10ee061bd78e0fc61af278be4818698798c3a75e0c91 +size 1478640 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_2.jsonl 
b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a2bd80dc1ebca3c5c6c0bc68f4c1ccd2631c31cb --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b09351708f7aff4612c8b0a1a21ba54df7e6efb93eee57f890fb0f5a26a11140 +size 1949746 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a46a65e4fe4002106f256af91538cb895533213 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cdf835cf2c9c1af102f81620661dd851e9730da482222a7ecc5a11beac87f22 +size 2419359 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b6d5b5fa5a33e7251784aa48cf07f6b8a20cdb9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41fff6d9663fcd6d1cfbed37c42853a9bb886f9bf2c9a4e703ab11936c1eb3f9 +size 2885850 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed8965cad768436669138c7d7a9e3441eddb5147 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eba46d25c106c1732c8365de6b641caf773576e90c7b6a918844add2f7990b6a +size 3354329 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8bfdddc8f549113954df9de44cff28a70ca01e99 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6182f868259d0f0a703e77762a426789e60feede41bd394fd8a8f25acc48e145 +size 1160418 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ea62c1dae85ad1828a627168a732be5bab2adefe --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d537d5c6837d0d9bfafbdbaf2667d2e1dec1385eb6196aaee50bd8ac007839e7 +size 1668647 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d09f5440a08eb50a83d942de83795887bcd818aa --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:744372b1cca968a69c13ff0ce1499e4b3b4d8288fce2de6513c9326f6334b38a +size 2177037 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7eb6ee0b28ab1c12b7a756a0ba150256edf1d31a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51edc23eaebf7f4148fa556fb6e6f4aba778b0a719dd11b2a1107d295d3c58a6 +size 2683727 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..84f64948bf4f5aa1e668679b512a957ed6c04f60 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94cb8dc10905668022c520738e57380947f18e0a10d0cde1cc6c50cbc177f3d8 +size 3187646 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..44e0311870f4eb042fb1b6ceaadbed741358b310 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44b1ad8fee49c5c98f6e631bb6cf177c0810de936c33f7aaa7a18a719b18f01c +size 3693621 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b86008c6095728ee5b1650a4344820d3c9a1441 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3533ab259fa16f78bc5790e432867dda99488b9b034a8f9f65632611cfc1e28f +size 1026579 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1b2f7514c28b3d0e60f7619687b86e35612ea5e --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd21b3162cacfa57437728e6abdd185de9efb3bad975aa59294b76c811a384f +size 1503640 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2405b06814175a46147c1109df76e41db766da6f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:494af4c8babd22e272c0af146c8e5682011766e0ee3ace46a7f5746213876269 +size 1980802 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_3.jsonl 
b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5451ea26b1397abef06be909da3f9ef4b231f6f8 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbdbf64f7588bcd115989700c8d62e06c9b2bd5ffb4fdc49e5ee4586a1eed882 +size 2456449 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec06614e1ff1d1e1ca0815f9038c90d28cec068c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8c4dc47b3f08f801d321d0c3ef1259ded7230ecef38d838c2568c6f20097067 +size 2929101 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8ccb9b91b43d65acc56a5d02ffc317666ba3f96 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r1_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4da395add110c4706ecdfc50ae80252cbc77fc128df1c800a516b26f49accfa1 +size 3403654 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4fac418b01a90ce727affe9a59825917a610458 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:651aa4c9d3c93e0c7e8c7c6ebf4dccecf5606ec2014420296d6349a439d311e9 +size 993516 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f9b8c6e39c91c281f62592f5cdd651d994e2b8ac --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b74f26e2f53a20df9aa6953a006558bd5d5527f854e3c97277bb5c6cee78174f +size 1446359 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..abd9a25044ab4749dc9c41a7ba88e033a80be19f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17b703af42547d0e5dfc83975a61ea2d259c2dcde7619447501d88f84db2b692 +size 1899988 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..48c6b731003195e05fbda91f5102896b25ff22cd --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ee89eccf992d92d18d78967d183610cf989c7f8dca2dade3c0219c133a8bf8 +size 
2349885 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2ff88471d5d34ac106d15b2fb3c7af44cde5e8b1 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2164264b14e1d63101ae0076d1eabfa1f527270ec34caa4dcd7db895c7e651ae +size 2799609 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d1353a3a004d961b62298aea1324a8a513243c1d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5564111a4a4f8bc47ba0cd800b3a2da5e732d764768d6f10b79b274d3c4e5126 +size 3250256 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0968a3459a2578a8842e4f926a242aa3081140c1 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e378674ca5199bd0112cb2d2c16e176d53cdf2109b7055f90aba5cedc42731b +size 1203517 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79b9effc40be6f7a0ec61081d035d801ea9e90ac --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6600d531936e7ef70031daa10256ab0157ee2e4db6c4008c3be3cb0877b09e23 +size 1750064 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f7ecf391858c042aa61ed86e863f81660b535e6a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cb013eef604b1374528c50bf667cec8cf22432a3b48bc860723b98c365db85e +size 2295200 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..663ad66e60097f42b371df77176a18b95857100f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cedec0d517460ed172d816bacf9479ab3b2bc9d43903eb4240b9ae5d7cac46f6 +size 2836620 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c6f6f9ff7995a123589b710ed8248cd4276611b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:3b5090317e5d77e8adb09fd3dee2137185a8b54c9504562be7b81b666e04e9a6 +size 3377612 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cbefafe3afdb4f6d96b2efae6240086bb1332692 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e53f74879842b7290ad26190de2d61d63d77c8a1c96a0d89ed414d04c4757a93 +size 3919375 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..724a05a5446a92907737e96ff3f351f635d24220 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ab2a0458390e259a7fc5ae5f9c235ccd93c838c5cc278c15c3f7c525a68364c +size 1008163 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68c84d98d6c01085b3a64c630542741e9ce37bf3 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:496f039a787df2357c566ae4c5d91957bc23ef327e30aa50c836a579b31a34f5 +size 1474064 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad2a3dd67ad3a227633bcc7b48981653c387523c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:291c3fca0be3d47860422c8fd82f917521684a81673ec06cd8a6eaaf7964e35b +size 1939964 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..06a182d787a531873595594ec83e9732073776e5 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e51ad6818a101aa2234df7349aaf61dff452994e6faaa6be14a4dbeeec542b1b +size 2402038 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0103d41ae2c15a3e80a4b019e9d4df9446f50c93 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9456db346cb7a02b3742c862d17b4ac3eb2ea5ce625e9dc6f8847d3782f3e680 +size 2863903 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..28e2ffd4ae4e69c69d7e4cafc8bbcf3d96de98b7 --- /dev/null +++ 
b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb4d646f031d27016cc91eedf669440427d9104c42050c679ad9915087306818 +size 3326644 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..746acda05345e1d5621c0f3f0e6dd1a3fb1bc93a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a125f621fec86e0803ca2c93c7db280e42f30a8aaea405e247710415f6f160e1 +size 1160838 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02cc4d8e6a3692f51dae79a6e440318d6ddd35a6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3737da7224265c2db5b7bfa5c03a1719c6f94ad2d45312502d70995200e9386f +size 1663799 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..352176fde7caf268b509280719968f2f07aec754 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a02fb69772220e3ceb917100adbcd887566f86aae86fc129612c481a5e7b1a0 +size 2166540 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..65e4d6c57c1018eee2af17894c194ab9f767a016 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1414d10cc9c8c15c7d878f868ef5b43b715205080f86d759d4d61c780a600792 +size 2665655 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ba8b97b3be07ac66a3216566559e38fad327257 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ba79d579d006d9ab95ea9a147838fc12bc127688426275c43def76f4603f7ec +size 3164709 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..66f0eb0b8bcd74b6c8f375800eebdf9b9533ad8c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3ce0178bf143451e4aef1baf657405acdfce7ae58045c1b840391208e9db843a +size 3664910 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..24877e119c624fa298a0d65f269dd0d8309235b6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9967986efdff2f9f0a0f76d00ca03cd5c8968be27da8f1a79ee36c08dea2bf14 +size 1027012 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8f35ee6550067ae44859052281da76b0f9fc053 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a0204268a3b2aaf8b93a5a32c9213187264e8125ce69590237f2d0bfb6f30b +size 1499064 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..af362041b9f128400f5b66128f528e402aef399c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2181a3fd14df9dde91a4cc63d9ecdb3f3ceac78e212d3fb7c92845ba32b334d8 +size 1971022 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..24118b3dedda474f81216bb5f9e9dd7cd3e47470 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b47270806a9bcc6c5f601b3112b3db970c7076afb8fc579d2920244237e85ab3 +size 2439136 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d9f66562f8a5a9224114bb8c2996443f34a8668 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d982fc1b2d49c3dc1f48a538ad944b50dc2aa82773b814df515a3e6b74a5930 +size 2907123 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19c2788d9b9c8a65ebd7e80076eef39c7fab19a1 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r2_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39f5efbff2559d3fce1c0a6cb4bf16f1010dca290172c8b69d6cca95c01a98b7 +size 3375952 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..d8cb38e60936ec460f1fafa58c53ca5d8f43e3de --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e198e3e67196427a12ff644268cb7592a79a5d46883a11ec5d92d65e4cbe7ea +size 1169161 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d8532e8a7cd38a2b1abae5882ccb0705245362a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af8cd136c5dfcefa8a2a8188158214c9a620e6d5b82ce09446d2206f32c98482 +size 1697887 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0847ed508ad29bf71600c10a191ab5b702a6e8ae --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:490a38eac4bedab597db4ce9fd33455fcf13d63eb1ed3e63dc0a548102e836c0 +size 2216991 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da24c7a6d3ebde0e01f50bbdc296a8ff7a18c1a3 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a0b8aa83c39bc5a4133dc717f553929b0e4598fb29dfc9947e1d9d0bd73c28d +size 2730410 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ef90edb5bf23f1499668dd578663400292a8bb6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b117e82e7d8303a63470fcc35248f6f86b99d325aaa7785734f6777e7bda2cd +size 3247878 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..74ff4868c47db020b04aa1c3434af57eb5b11b42 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63d4d6829c246ee578afdfaa0ec6fbcddc1f28e44b90f2dc289c9fd1647dcc37 +size 3776443 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..979682c3caa5edce967d5750ff2afc7ba7e9697c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:804697655d7e8d9bee8b4aa60c84fe05ce7085b0dd033072c176e05272c8267e +size 1421155 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_1.jsonl 
b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe3c0c77f0fd332fb2b9aebbc2f2949d79b9d340 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8436189e45f167b68f379e1b616cda810a1807bc1f8dbcf03908bfc6beea0535 +size 2061859 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fa3d0e1b5612468b98445502fc23c4cbf831e635 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c30f4194d3c4c23d03bfdda8d74979946c769c7869536b23fa12f32ad929b963 +size 2690674 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d2e547442d3130db7fc6406a960beef1153c104c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45f561468eb6bdbe3c90b4c712f160c9e1bd782253918de9d3bc0999b6e67964 +size 3313708 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0807c3fc09da31201de98d7862cc22b3b1afa68e --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94c1e1c9bdfdeb3da4eb53755f38b9c6445ef8d351d9281e8da8bae1f52e1392 +size 3940767 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8fa6c0b73eb88becdc9714d04bb18236116a67e2 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a80fecb32eb9956f5f5dab854d8689cbb5330f33646c98c8697d839436cf17e4 +size 4578608 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9c6592a9209b7358c1958121d085882e57788853 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fbf01c944d7afc76390c302e43c320d998c2468562378beac3f62f6c0c13374 +size 1186814 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5907ebbbbacec7845f2d1c100c6e634df78d0a52 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:df0b45ff0d80f08376f1b54c7d7510caa8528ca77df6f3596b2137db2040e778 +size 1730743 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3cbf061dc2b15773f2feb095b51cc00367d514a7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9591d924c4231ae22e53563dcd2ba03df35843e9dcc0afdc68fdad6361230bdb +size 2264715 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4adc0cbed819567a6105ee36a941efe44da26d17 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:737b1871eb28bdeffc2df08d910d5862ff7b04294c9b635e38456adfe1307ff3 +size 2792706 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6779a0444889f57986436d3b53b124c7442e215a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73b2b4f044ffff2323c2338848fbbf8b5b8d41d797d045bc1e9fad11c5002154 +size 3324817 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd8d8e65bfa8d6cb3c649ef2c72f2b7e51540d00 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0762384d1fc3ce5076b791154d149446661adb05d1d09e83dea8495b66fdf6b0 +size 3867848 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..887511882a5be2953577f2b761511fbddc770b6c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50a40c5bb1605f0936efd31091e5bc7bc234d2d7e2a1ff98264fbec52505cfc7 +size 1370811 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c69a1165bac067382c6b6dd3df3b86e4a96a3665 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac1b8a58ad8d7fa778e66bbbd03d5db93593ef7f435c00cb0ce09ecdcb48a48d +size 1959139 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..88f93b1ec869c20c82d7bd981c2674ed2c1c4409 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aed6e6419af97a668bff153c620f7fca7038ae3eb1fb5b26fdc22b6362666fa7 +size 2537634 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b2a176bbbac4942697dc2a02bec6cc12e4de07b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eede3f5ff29ee025a6b64e5b3ff16b49f5439c71ff8f68dd97c929d587f29e9c +size 3110565 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..afc5f07fca33218b75b3cb3ee77688bd4f904170 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:702dfadcb4fe701c77569b6e7265cab1a6ee1f18bc8c4bd4863180ab50771364 +size 3687782 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21fe5b57acc2299b9a3357f381d90dc3ae54d831 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9baab225498de11f36e45b1685839e1d3c43cb5a1814c71cbd5d2b5ea51136e7 +size 4275933 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1cd1f1b6c596d6265e3a602bd3adfa3dbd15e309 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0da6eb10ba4f912be618574145bfd1a0c6a0c1beb06d6fff77a6ea90d074554c +size 1209479 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b5767a0e5f20d87cb0bb084ba367fc3b4d74a7b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4596e0ae0344f2dab5a70c5082144e6243e30c4dd23f48a344d82bec37156f6 +size 1760744 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b20a25244e4f1d31080ceea7a033873bedc1f3f4 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:70e547c5a537782affeecb55b98431c9489c86586da0b15d4d356100ff02c350 +size 2301985 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2610eb10bfd1c4bff3cd4a6270c160c93f611bb1 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb11439909341b1e0d256ae6864fa58c4e20f83d4d49e7304f0e7a34ead563e1 +size 2837254 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b15f82309bdb36bf9c91b9e4be8d7789116f8b88 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cffb8a2228b5e1f83f9f7c27d3d1b0feca9a1e510d6e59ef77ac22b5107100a8 +size 3376650 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8e0695adead03f3b9c920ab7e4b3cefd2190d4de --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_anli_r3_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e421784e4b6206bd0002467ce644a187bacb368e338660d3eb9362a248f4a010 +size 3926937 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..94d22251d4ef57c49024acff83bd427b26aef7ca --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5e1952180e1f1c35a663b2582ad755a878d5cee36770442902d273aa99c51bf +size 1216777 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4000733d1bdfa4b2abd57016051e66b8b7404a43 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:712ebe6d6fd58ffa28e7aacdef484ca4f64e0a02c1e7a6558d398e816101fcce +size 1670632 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20d86286eacedea5c28e2b5ada9d444a28bb0031 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d1c0e35288a564629519f52f166ef869d717a984e3df96f306f1ea040e90ba4 +size 2120084 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_3.jsonl new file mode 
100644 index 0000000000000000000000000000000000000000..40c1273c0c569485371d791846741d78ebbcdbe1 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ee20908ccf741170a23807d08ecbad324b4fd648740f43f12dcd8e0cad3e0a +size 2577615 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1e6d2a68a28b8d05838ef71e176b41401593bd3b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a66e929fc6091ea43668da8e5f6a40b097f600f40bff56d68b102d30cc6592e2 +size 3027517 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f132a90d578d3568f7432f356c76f553e99baafa --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb7bec8e8cfd6d91d8a200948afccfd3d81624a2f7732e72938a1ac02d820907 +size 3479076 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9c9933ce0fef4533bf2ce8c19ae8fa1e495d5854 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a217fbaee6ca59fe047b0fa49aa9ec551f95730e385d607516740af066976b8 +size 1458251 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b2668aed518c927fcdbfbb71fa91865e705012b7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e69eb2a50eb0fc99078b196ead4f6301ecd60ad2abff0759642d225b53cf50f +size 1960890 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..de5511b78fbb281885c7ae31dcc15f4ecdd4d6c7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0bc2c3eab49b13d1cf70b474237bf204a9aab29a49249dad2d44dfc7ef69954 +size 2457665 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9a69b8ee2748ca468392df352649b3cde49ed7b5 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d6cfb5b25184c4d99347dfe0a7f3d6048a19ad2c4ab376673882c9a84c789612 +size 2963370 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7933bbb924d468a724ffe40b7617f6904a68b19c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:220f635fa39d14c7241cf16a6746aa81d03b5d017f4f0e66d06b0f86f3822070 +size 3461513 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b6c4f758d6a9bbddd4a431bc64a0889bd4dc1ac6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a21b5459c8b2383ded69df4c0b01b5b10a823297498b634e0612e7e38b09e7ea +size 3960695 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c02fdfba2c711334a090bad6afdfd57936c830a9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b50ee737c215fa5ccd661619881d5bbe99c02b99fd621d013bf72d35db6febe9 +size 1506391 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4bc49e0edd258f2d9152514270e40d209b4abc47 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:723253e94d29e9cc315f6defa1935cb4dfa44de8a7f58ad17c8099a33bab485d +size 2032811 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f7deede16830dd086a0215f2162782698835c5ee --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c36d90d1edc501888c1a20f964d8b21c30ac31d158a34621b78af585e4908133 +size 2554309 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e476cb9ba11c6b55972ff1416a993a7cb4e48029 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68596823d67ca2cdedf436f26d9eb008a21c6667811f9999106b7eb56263228a +size 3084775 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..e62a85a522fc340b9529b19a5aabeb2f710e9e7c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0224b6038d27ba2a03d035ff4d6a9d91f1dad9d2a92efc1d3d91632e4f1fdb6a +size 3607332 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..95e2bdc6d8f1629477e0b05a04a27b04434e4d0f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:319ecfdece8644b3dc6c5bf8a3a7cddfcb2a1bb1ac1f962b568db4559742ef6e +size 4131055 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4f5b43574e5f7b43f9d50e521669ae8427675a64 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:493b60d86abb764d755fb79ba0e40d823f1b28cb69d65dbb55eddcfeaafb671b +size 1202714 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..01ea17439ef89764662dfe8489a450bd439e9c90 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5f8d3d29b5dfd918ec8e667adb6bdb5e8e674e703e35c0094ed2784eec65551 +size 1638992 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..904e238c4d22746886e7df7ec4c9f807bc14a72c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9d7f81dfae63efb3f8fb18304a95db7bff35a990001dfacec368229628ec688 +size 2070864 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..99194ea363afe6484f03994f89021491d2908082 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:066c83f77a43d2cdd4e047453f56b01f081b2c8138b76af694b89a866b14cd89 +size 2510815 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..38003a36c2a772e07bbc7f2e8253f4c59a496b87 --- /dev/null +++ 
b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a24aa213382cf0b99abb518b80e406a06b9d3f81804d13adad4e4cfb4b7f886a +size 2943137 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..06080cbb0ce383e76f2d3a013bd6f3261c4b81c4 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54a507ea0d49ee415d2d8d0c28fefd8d645aceb49bf4f83662609c2584089440 +size 3377116 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..48d29fc87cdcea4e39bccd41f26c7510489dc641 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6216a2d59a65c9919392e44f704ca8392fe8d159c53ca9ff450caad3d9c98436 +size 1187482 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a111b4c39e87bb00ddd77ab7f2188c1a11272de9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0faba40301cfd01c9a13c58ff367af4d599fa52e6e9fe86d666b7f0fbb3cf291 +size 1557821 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae67193f865017a143c5c05f821546aa8d13160e --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9417473dfee008ddd5cb35d9bf887549228a65b51a57399ac363c23978e35a52 +size 1922019 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71d590ac8603f1aa32d356fae0ffdc5fa5e97abf --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aedc7f09bafba18c01ca142afcddae4cdc25084bbb2990ee2924049f71c18d4 +size 2295321 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9fa891ab4bdd45ef6d67cfaaf03d5d5b49b60e7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26179dbef7ac0cc6f50ae655ed4d1dfd6fba6e5905223fc7574c65f8793601ff +size 2660829 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_5.jsonl 
b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5352c89aa0ad14dc0859fb35d7d7360c7078db1e --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_challenge_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8d8a02f9f9a481355e231e76bcf2d160235b1f6fd2719f8e92f91c5636fa44f +size 3027678 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..612551ff045fe0be9b98ff300436c363ab7dd8f3 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cb12b4edb1bc611d072b193b4a4786d386cfcadbbbdf86662c6b7e50bd4be9c +size 2351241 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1aaca3fac6f5a16a418814fd2382954a8838ecf5 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da5e268144962ec4a4306b88f3380a4176c0ea5429ebf276dd684fdb4e387a99 +size 3174164 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c0af0b3885c04a6709571f31070a9f16f88a364 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5687d7a49f8e72bbfe2c8e990a7bc26e4c6f3af11dd32d7a956282b4e4e2625d +size 4008946 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5a8846348b7c6dda1752e9b059856a5c71a16e27 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc1ea8a8af0d3f3b0acb22de4341f754672a721b9e27027211c1eb9c3d829b5d +size 4831612 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0eb7bc01de2a7c27f9960e8363ae686df0fdc443 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f31ac7801069cc028eb6e8284ec2fc9ac7b028bf8fe30740628798bfdaa7141 +size 5662356 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..41d714b26bb6f3082cb469db16416cacb581854d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1262e33dfa039005aab5e1563d47bf2ad22e6574bdc451ee9665fd7fda6b0a7a +size 6494729 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..696fb987310c97200ad781f9219dc5e1540869ee --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09e1c3410e79c61bf7368518757dd3e956a3856fc4b8c3bd5d7cfc716014cccf +size 2745883 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..464f261b3489da644c79af688721b7ca22721d0b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:567d22cd36402a5bebe40f19c00d20f8b0a8886ba3765854cb8344b50d352139 +size 3649051 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b5353059449fefec8be63249447b00751f595759 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89b23bf6c0bf830259faf4a6fe9f9c654008c3ce74d26e1da547583e8929f538 +size 4566938 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8df5fa68e3afee7a4260958fc0d6ff6d9934d662 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81500528c253c4ce4f573dad4cbc21c764843ba8a5fd85138335874d8c614ab4 +size 5470372 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20faf23b1f52291c4f1cf290510cbafe69fa097e --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b32531d7bad1f9b610e3ca0f73aaa9470482073b60255f64c8334ad809be677 +size 6382888 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd2dfd0b06005259861f519aeab8b5b2242dc92e --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0b7755c6f809046e5a310b35ce5d2ecf9dd8e218f0da30a3171ba66ae47adc0 +size 7295174 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00631ba3abe94d1017f9e23f22fe2a1721b21703 --- /dev/null +++ 
b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f8ab89ade2de3f63ba8752f200385d3e9c4096df02bf5ec2c241e32d1ae46d4 +size 2843962 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3013ba1d5923abf7625ac087799fe231f12dc575 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9487e7cfb3a0404283c8e986d5c3c8fbc446145497cb7e9a33559a3130dc14ea +size 3795468 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9e4a40707986220acdaec35a58bd6791e6b8357b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:268a2253481b7d8de6b49429bd4cfd5b5ba1f9f71c0a135eee000d30a5359886 +size 4763467 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..27fa7309ce8fbe8f7b44df3e693b25b76dd0473a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fde4155aac7cd5aec9cb72f21f937c0afa3c42e106f566b605cfc39575880c9 +size 5716850 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b09f1e1eb20bb26c7a1172cae269f276eebd3778 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:476e1a3280fd26f4da40923ea3718127a36292821a97947c7d05c156629737f7 +size 6678953 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2783898796423f1c5fee772156a0d11bb7d6e800 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25844fe7b5873eaaf84aa2559cf8103d957fe7af9dcc013e3d0e434b275a84cd +size 7641431 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14a821219c6034d91a8fb79b0e5ff100238a0568 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a11d68593515256e69c1e4b9065aba56e58c8f0a286e4eee13a77edd65f1f1a +size 2322732 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_1.jsonl 
b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..15a1540a141126ffdc553e9d71901c277b632f11 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ec47c2de3082b7ecc9e2141d19ccee31b2ce3216640a2c646c3def45a4e00cf +size 3110021 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..429f9a82ccaca63a77ae4f6afa9d452e9e45e003 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9a0aec547f34a1b2c272f51ef92a85aff65999cd3ee951c4f079b42ddec3d7a +size 3909160 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e77af45582bccdaf43912a43d8eb90e58040224 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9724e57e9aef048aa614a1c3a7351070c42b13e89cbd069cee7ad0ec6aebee4 +size 4696184 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eeb00924db558b3fa4f1fd37c3be83db3e7a80bf --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dc64615f6edde9bdde032ca7197dff5c754fea87103b5cdd1477e5d6df1f0b8 +size 5491290 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d4e30cc5cf1cccbd8b7ae26332b3242f267bb2bd --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfb9d27b7114b4a17be4d2e70c56efe5b4444a64a2b33ce00b44ef0a5bbefa79 +size 6288023 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4760b7e688b822f4e67ba6bca74b875e97501be6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8efaf2c755ff4f58efca8f3bd36150fb69e1cfea595d046b532c721f682a6def +size 2196975 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45fd4dc7778a06906729a28277bb339b251db88b --- /dev/null +++ 
b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41a6d39c0f0daf11a477fd8199f6263f16763bf11fddf167097280dfc6b2dfc7 +size 2831705 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce81b6b91b77b402aa377c033e9cd8cecfb5e971 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a05239bcd7547ac677e97967278582af88884a20509b9c5b7cf461f04d191163 +size 3480827 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45bb87a9ca9846b41430d590db5b8f92c3ab5e54 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbec32bf4ff2893d532d014228c392fef88f2a22549baf0cb129fd6ec8b121cb +size 4115953 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e62a7ad7072684fe3b23b401d4d0e1df0d565e9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eda34840a3544c3a0557e5a3dd9f506fd3638f78f2fb6e9b2b6904edc750ce5 +size 4760207 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b855da37f798a8fa9c785a9135c72543a1e45017 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_arc_easy_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc4ba4066fe74f1a437274310c14c12239d2578e64b1f8e524c9ae3f671ffa1 +size 5404046 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..91f7e9dc1de6777aef3848940c3bebd6caa2d90c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76921df0402087e4b753531dd9b605c952046129e99de11cc795ce6233791514 +size 3642531 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3617f9467ce9a6af9f57f4562f8afdbb6eb990f7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f634fea708cb04925029b4fa094087154a0f35b7faf22a32222ed4f31a04565 +size 5657412 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9562e8b9781da4ecff580f022fefaefb246a4e51 --- 
/dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2567bd9d5e40fcdba8695382ee462089a87848b8359cb1044926c5e36b02180a +size 7694413 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..081faca32c23ffae0fb6bb1776441e8fe20cbb4b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51716401eb864d236bdf2f2d2d73d9103878de135d7a3be14ef15e837a5c0942 +size 9728462 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dce18a51488f6e27a8a95aa0b21ee813b8aa5843 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:944328b46e039383e722adbe885740b6d1e99b2b26d53cb3b0b8a3e3d927898a +size 11769069 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc2eda1f0ee43b7748730aab439739c00bccf688 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_GPT-3-Style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de8080e00e47c722f95b4e742ee430d7c43ce8e00272df7134562ada7a1cd7b7 +size 13790531 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da8c5b590b0528d2a85723ce5c27e78034d64bb7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd929a63eb499481f1db6ebd617d1fabc32d73332e5264d123e575484ba0484c +size 3984656 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f69b122ea68699bbfccf653719d77631fbb9111 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd9ffe69fa7c24001c5ad25495d4fbe2ec4a833b9766593412f724f6d1cf4860 +size 6168139 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a770af759d970ad6f67e504260ee02e8089c5fd1 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6a949aead3180236ece3c972a807073713094340abe902a6bda28a6be7bc2a +size 8372306 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..ca317bcd50825bde5b70296b071f37f2c046f6a6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:280bf88a9a4c238236f2d480cbe358c928f2596fa20cfd679acd1eba05787c80 +size 10573390 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ab023d5eda383defb092790ec49b0cadcd09277 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8fe46f2889f6b56fb963032bbc108dc8821ea38c81224c963170c96070b2350 +size 12781144 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e7234c2aae6f0cd010324c42f06e100fe2e7f68 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_after_reading_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e977d9d30eac7bc8ecab2a683bb7c1813cb96bcc540d28c73f757694bac9928d +size 14969899 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d40d207590fe585e21fa97aee4bb270de38833f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60753d451d8b4c4fbee9fecf9a14420c3d6d35449c9c791d1b72de95f2ca152c +size 4041656 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e5e8df9eae1f9f9549a327be53ba6ec381489473 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4ebb29dee0662a796fab9d3d771094e3c7030eb5f62dcfc8628e1ea2ae1de15 +size 6261080 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..082293c6e395b4c2d1ae80be8d5a3447b7050463 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d27676d2f60f98c827f3b64030d54fb3e9a763b1d9197743055cd8ae121855 +size 8501271 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..299fe25f653b07080443ead5e7503895717a9f60 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f475595364ec6a081d87e22b63024d02bc056024e8c9f7e44a2e9e83b75f1a3d +size 10738668 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..5a278214825731e02ad828d2e0c89e4d945f489f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d458b96285556ed6c56d674d27d8d7bc1cb288dd7db88457db0fe3674d6d519a +size 12982527 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3f61d3d0ee8c0c7d800233adf1124e4d012f63bd --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_exercise_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37249bbff5f925939a3040475cb59adf87a19abc5344e896ad0d328dd41b154b +size 15207344 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7fcdcc7896b47a7253741fc227dc76bc154c274f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37c30d15a30f3d1700e171e970c9e85ce5acbd73cb4602a45f6c20ba54e02ee4 +size 3663691 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1148fa66443f8aad35d834fb2988c702ae872614 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fed81d173c404bab7100a956097a1dde2c502be618e25e6561debf3f37ccaa77 +size 5688134 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c462a68ef761358cd56eab422d452edbab5afc9d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec4a137c2293b3c82cd17b63f9aa3a14a544457f1dce5b30fcd91fc8019cb67c +size 7733338 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a79d8c39ed32656b0690678d1b779897cd017593 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3f5dac85e9a331473fa358d0817014c5eead7fbe1e1e02bb98d111fb69b9abb +size 9775545 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3ba62397e1e2523c60fd1f5dfece19a733f37fe --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad82f5b0040e83f29753019668c256cdc9179e1fa32e655f7bd074fe3aa472e8 +size 11824354 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..108abb914049d41e5ed2c54d8d9f11e1ac9de7df --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_valid_binary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61917ce6c95aced5104ee0569d91140345f5fda27c4d1ab185972b89e8e2268f +size 13854113 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..38939e2379b4cf6cc9b246199d9951098736c992 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:610f0acc8d73a319a0b56586a040a00f0972f05d625f689c8b55228f8a527797 +size 3862397 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7715374b4348ad1785b39bb4526e8de1da6415c0 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ef386a4fc84b20d97b20d7ecaebf02c38a079b0d953914e8dbdd4b2e89e1550 +size 5990492 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4a4399ff09ddc3e00849002f53c6f864c5003384 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:764d716bce673caad9639093463eddf25304d74d1ef0b54c45b270dc3ce596b1 +size 8141730 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d62c0811ed9f10486a408b555db4f392c2c3bf4 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2e615a289b58a31779890391b70e64e27283b22a31d9b9efd72728127f12a4f +size 10289822 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..748bad353cfc3c36a5824fc9f2fb598c900026b3 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:233b280a034e359ac52aa520c5bcd934ba4cf9354849d3ef7b44c68d4b6624b7 +size 12444408 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e022257bb3f3bfed20c5523aa5caf27365490c74 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_boolq_yes_no_question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3750469319f45f44b6e54cf0dbbd5f18019e62f9d59ad61aa39b692335ed54 +size 14579901 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_0.jsonl 
b/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d96a99663a32fc932eaeeccacc7462560e2e5877 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d6126480f6c7250b854a223be1446b3fbc839e24f5a67e46dea97a3fc38418 +size 55148 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90d25f4c70e5343cdc810cc97ef81f9789551876 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ea9fb6ba06bdbcb47544ae0943531820acf2488905a8e3a489cdf9ca7168fba +size 77967 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d49fff67b16d39ef6600a1450f0d97dd3e54d293 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6520493c0d223407476a3080804b3b4c5036e8c5fd13dbbdd91772a86aa5577 +size 99579 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a0ce69fba61f1e61b7f26a7fab72e6d1d8a7c15 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d4e9bc3b10b289a3930db3d20acdef6497e98580c5b9946aecadc7fd2727193 +size 120727 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..015bbf68baa8f73aa5b9570e070f65471b1f8929 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acfc37072b6c973075ab9d9adb3e4e50497af96dc2c81cc17dfe85fcbc65a321 +size 142848 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..99a74e5b9a554e04eadcee461a65b47135b6576b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d690f3c2ba13182c2a7e361449ed57863fcc91d12f9b759364028c753396ae8a +size 163671 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..806b11a9c8b43716ddc929108e3243bee2b45fa9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b09d06d81d146b2e5697d769fd76799731df2c2d70081347d8168f300891d6b8 +size 66218 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_1.jsonl 
b/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05e727f552de7c8f5595e084f9c4ab9b04849c03 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc9c0226b8c01919ce546dc31266c1d747a99ddf5a4fc3b08918c855832e043c +size 94141 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c123e9a2d54c4b3e87f9b6ac799b7a2c0ea56459 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66ce3f2465180c146821fdcaab3ecbe21eb647d370d89b795df882835df7bf3d +size 120858 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0be08feb387a325c3261476e2e45708db4560adb --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6566e95d8e63cca39fa063dd69d923f8823a60490ea186939501c1460ad5c020 +size 147108 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5f74f9d0d5254f38c67a381cd80f877c10a4c84 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0646740c2fdd16ec0ca98945e4cbb2872155576669fd63e414a925fefc16cfc +size 174312 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a3338a0ac7b79751738ef98d50c7335fb98e086f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5ffb260f096b513ee71fe02d94223bf4b16c4cd223b1d728f22f72699c00316 +size 200215 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..11dd7b815ee3b7dfa2ad0697aadb96e8f2cbb566 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:414577fa6ad0f178881560df60a8fa8ebbe9bdd395af454c3fcb3b7137ee4a07 +size 56300 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71d915e82080201bc20cc8f3431ee44bcad403e6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2968da46200dbb73776d530f2887bb97caee5c1a661def8839cf56ace0044d +size 79780 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_2.jsonl 
b/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..01cd5f5fa9adbe33c28608c5ea459ee3c03351f4 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8fdbfabce2b76c158a7834e200af1779c8dcab850b0f1076fe817653a49a9fe +size 102066 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..46c9ea2d9fdd0c30ffab907bd15caafc496640e1 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afdc83fa4124c367d1fc689c6ba841c6bd177e008f6a87d6da0a0a88393e4546 +size 123872 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e2775f725d8537ec37154453da472fc3b3c8b6e4 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a51970fed43eb35b1ec6779bea2fe9558bc5f6ac322b9a21b69668698747716c +size 146654 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e2a281d74553695a33673958bb8945fd050fffa --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16db7fdedc860ff0006d8909a27aec4183b08358384515fbb7b7664b208a181a +size 168140 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d48e61e956b594dac29127096f587061efaa202 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdb5a04f88870e7986c9248edf8c878b7a95e7a855e65c417a9042a5f80474c9 +size 63995 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec6b3a0dc6c0a12368729ad2cc0a0dafefcd99fa --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c7241a9a496d980c7b178a8f0e6faba723fb2764d4c3b182c4f69da3b21c06b +size 89650 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8f65276916de367ed48816951085cfd3ed78c7c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:45c2c0411c4a93a116b3c52f2a2555be927d240bb4c4cafa81387f39bacb59ba +size 114139 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b56e8a47e7f456a0be678dd3c6b0be18a4b9cc3b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:703418a469c75a6f0d4a03f3610366f171c3429cc4b21198ffd7e3adafae3bc1 +size 138166 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..16ffb2c43c3fa2d6b702a9c4d560cfdf07b4ec5d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:784e3ce7adcb83c1c1f21af5e8f048bd31745ce78c3a19a71d175d42509a304a +size 163156 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ddb9ea410bfc7e73b6750375c6c03ffa6970f090 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9628db5ba41fa07ccce7a1f02b9fa861a99539e35d514aa6650262dea233150 +size 186832 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e60792bd90a2a070ff725ab4c503ab6d663ed4ec --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b652cc4c44ced23f30d177dd9069c1031ab43b735f6d8280339ec17a85a667af +size 57316 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..191930176fe74cb5fbc2742e8106e1eb995f30c2 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcf95aef3bf1fe5be83db59d096ca2b25b881709786632595eaf5c43d473e05 +size 81124 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e1230b6fcf78c38e4cebb6097999c563189a0905 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18799f950ee8df334e9224593b684def1c7ae8267071d84f4e1ba0c284d01980 +size 103747 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..1ba38e9254957d7d4499783f53b5dc9e494a919d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daeddfaa49416442d32ecf42cced92e16ade6a682eb85ef2bc10adeb13ad4d66 +size 125888 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e7ac5d6fc79f8764f24fb4f63789b4ae0d32eeb7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a204049975fd82bdc8dacfde5ffe478ef9e3a327c9935dd81b390466c3121fdd +size 149011 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a3121f4814ba65e398f7286154989076ab8bd606 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_cb_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a080c93f83bdfc3164ce741e6a3417ce2a1b546420e99d2ae1d6bb0930fe8be8 +size 170829 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0fd14d51a43333d61dcdc71d108e333ccb1064c0 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60f8a21bbc2ff9d5c7939ef70f00fef56653b163ff609d3b1957664bd507e470 +size 92271 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5ea698812e48879103915f46d8f1671462bfd97 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c47f7656d75ffbab83dcff8a5e4732aaa0782431f1ad53f00d6c0333024dd699 +size 111563 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f3bceeafba8f0f67c66ca6f834cf683ac8f50611 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24fd3ef334bd9dd40311d88f2459683aea652755e0caba6f1b1c0cb70271e68c +size 132116 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f26b329e14e1b064c17e39aa715aecd818dd25da --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9345438260a59bc3f14537c9e932c6cc309127f5f2931be4cee30237ed3047d +size 152236 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..f78da68afda4cc13eccc98db1d2faf63dfbe7961 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:873d297f80458700e5a0333088f1e3137ae07accdc7ac280e5ee28c0549b0021 +size 172064 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f37f937edeedb347396e1422c3e7467f1ccd0208 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_best_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c1fef00bac9075b12c10475f3efff8689c488a776a82f88b9fa511944e8ba5f +size 191989 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4f4cb9b3ac0f37ec6dc9e634478e27700ea3cf87 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2adf2b1cf1fd52891d1ffdb17695924a1f3b31dfa0a49f0a9661f4a3b844486 +size 87855 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ddab2426b0f108c0d6743ebf669b384f13c479b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c5af177a23158fc0980627aa23319db20f17738d5d7b2d666bebb36ad0fa4e0 +size 105051 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c174022f1ec0ffca93893f32a6b4696b74178d1f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ded5e8bbce6bcd04ef85e02b2f1c320796885a185f77c0c1d4d217f8f41fd333 +size 123428 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f49276adf7cffd0984c11709ca51b344e883ef2b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c77ce2cd13b7e24051f80b05f8d2d72b4dc48a835a8f2342b418dbf22356a6d4 +size 141417 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c4d60c2e700012f0a665acc4f551dd2c9d355e1d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0cbf0859268f04a1107e8162ac82050b39a82b203e21faec65385074ad2ee50 +size 159083 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..999a0c1622c3123efa11b5543038484ff573ac20 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_cause_effect_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac3e6a079935167f529aeedf625c066ae21e522e2ae67ddb36201b9bac5f8cea +size 176911 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2009b439256e0c49432eef6c9c1e640f8fb3ff30 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d1e1f8329f64c16a10ba7829f89c2dff9a63bbe4ef95d223815ee2c7b8a7a9b +size 85185 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eb5a191e6f65180477a6de86ba248197a2548feb --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:716557dd21fcd95cf8715e277d127a9a02dc22499ba9827855ac0dc1fdf42b12 +size 101249 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e1282c4139ae81dfff4802e391a9c75b726acac --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:231595c5f57225962b7f1e0319e5f3307e6336e8b6606e5da9cb0bf32cb4665d +size 118472 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b306ce3a2ad6ce019a7a1627daeebb80275463ff --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72ce1a0e1d558cdc8f74b4eeeaf08a99c3e2856c3befbf5fe712922ae1694d58 +size 135395 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..40f17f8f2081d51e5698ac37913e865000cb4e1a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c09d3cf88aa1ea7ace0ca8a8f59e65f0408d95bf03883410d7da2ecedd0a417 +size 151940 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b03955cff78137f3f8a55928d0f65f372a34f49 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_choose_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:099e19d96ebb3886e45d1e388ef6ff6159978ca9a049bf12e4c4dc6ed9df25d5 +size 168747 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd77e66136e85287d5abfc8ec2308fedbeb0e2f6 --- /dev/null +++ 
b/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cca9f94ce7e09113076b7570088544fff8562a3a651c713b363dd0a49c01d99 +size 96762 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0e05872dfc272c9792f52cf3063a82dcc5fec9cb --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e35df899dd20bb7b87a117cd62f99f7d9aa6854a372c4ef655c1936003d342f +size 118253 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..80b36bde9a6053c351661ab7a72de86a9e25e4bd --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab35a22ffc8f9bb20753e1fc4fe065ebf0c5c684e56e74cd006d3a7ca15b09f1 +size 140933 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db426957420f2f1e77d380179e204275a548ffe0 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eefffecfebc27021f0b0a476aaa0723c8fb2cd16d03511ab0c9800688339233 +size 163217 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ee5823f93094bd20d94a77e7e976b197c7fb482 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04f3474ed9e6aa5d4f843b2fea4f8bdf5a8b38208a4308cba5370da0bba9fb9f +size 185186 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0da6486280ffbfa60f0a66da59de686c0f2a9330 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:081142fe35db94e02d835c7ecfc0d10798ba987973900d11ff0bfaea6822205f +size 207312 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5605995bb75e9e82355b2cbe601135d1134fa73 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:589b38231fc2778448b09a38b5060d7384fe5a044730f3f1dd052c2df0996b65 +size 95777 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..661146b1a9d4eb619a64bc64a35b1792c27b4e9b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d27542d6a664c8a827986d088c5c59c96d20cfd1f182d823b9ba4b2ca4a2d436 +size 115634 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c47293f23965514c65f32ea6feea6cd4336a5b24 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df642852d54d3bc4486323d7138965d71e481398fd954e0acfe51ed3c53ce1bf +size 136678 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ccb18b5147e95aa1a644433c779b2b74409c1cf --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c9be20b8132b7c9077b0f1185086acda9731174a50d13a6ed1f0d6fce752526 +size 157407 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60f82e0dbfaad7e26f13290188436e8866fb138a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4606d27600f889d3e0e5a4fbcb7ead9e1c8c8a92c76a64d70c3ca20ad27bafe1 +size 177739 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8eab1e5d089e616dc14487c9ae8c163a92ce6099 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_copa_plausible_alternatives_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d31e0e9fec974a7b3f27c8abb069b13d1d3b338826a504fc5ee8608366f915dc +size 198351 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dbaa7156287dc12d30ecd988d50b3fe7c427fb94 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2727a429344e3600948b6dcc89623d061d2f629b3239045f6433e324d182eb72 +size 3437511 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f9e02a11f04758d8cfe1b0c7ed51b474cbff4028 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3293a20ce8c08238a59771bee7ce5a6eefe29ab83b29f00354700e4a298b0a48 +size 
3937994 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b5a6c33eb36c4dd7bd533d85fe111d85085ede1b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c41728d5bd65e53c988dd7acf956e14e027cd27840936b579070631ad9d6ac2 +size 4902359 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..32e08728e73526644587c114540de108ce0d911e --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3e5c1e7c3bf009117e26b053d1879fdd4a54b4bc37ca416bd3460dd9d9bd0c0 +size 5843917 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d5439974410918362448f076acbb5bfbe8dc466d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5ff1ba589d7ae85a24c7eea665baad79ffa4de271678e9b121c03644f49cc85 +size 6781095 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d680d0d4aceb189bf03c9a19c0d55b0e35d99493 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_coherent_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:175ee8f766acef283aeff5a351a9df474a7a07266576acb8b9df29d986b558be +size 7726430 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..11f37900c39293b7f3ed65172dcc86d478dfc1e7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aa12f7243e9c279c719a4cbdd285108f520a859b7bdf0ed1f3a8fccaa0c23fe +size 3384531 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9fbc72310bf77bd5d66bf7e33de3d95fab6c64fd --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4d4565106f7cae00510cc1a60f942f7bd1a92493bbf5fbe3dc7c03e342932da +size 3862318 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..c7a3d8fb394433a8879f06bf0038137b24eef816 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b690ebdcf97d9db5bcb85abada66a85d1d0da4c29b33b11e9849579d434940bc +size 4788662 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f3364577df148adee7a02d8a74ce1af393f783e9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dab3a351768f093cbeb9f643ee76fab513804919b3221aeb482c93e03a697400 +size 5700576 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a9ea655d97abbe93790c8d3a92d809f43904ce3b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f7716c39028c013eb2f30ca6141ad28680197c1f40ad2d1b6f0b3f84486ecaf +size 6610908 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..33ab2f1512dfb88c5e1e53da8a8eae72193f3e8e --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fbed177c1160924194e6fa9acf0f5e677fda57849157abca8018c57848bedc6 +size 7527106 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..202aca4006cc1158448332e33079062fca8d9da2 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a17a7a22f79f2b94a83c1c8faf0546d544f666c792f51155b4b5ef8627e29097 +size 3963302 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ee9d9ad55814ba375fef0d44e9aaedcf151fc1f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baf7c228dc8721ee855836a0655fea996cf9a7f4376cc21360346cec0729b9d3 +size 5047334 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..46a393e330a8dcfd6d86814c2f12b6b2dc949c0a 
--- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7da08e141bf433083cf1f318b6c75496c035a077f20072ff149c203b0b0622b +size 6067983 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a5d78f8920c7d92cf6360ce960b5462af27208e --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8a9b74d7fd85c6f085986ac19711ad49badcd700c9320a87139a487ed4661e2 +size 7086402 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9520b94a0238a5b101cc0c80a38f0d2e1f0e7eec --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25850b615fc1af1b2676e8126fd61e332a5dfdca35fd92d15786c50ee6702fe1 +size 16239172 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e5688baae0449f6db5a0c322ca62f25076389818 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cad4302350ac486359d55cd35c808867803e0ef7c548ccd5033e30d67655a64c +size 9162968 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e3959668ceaacbf1e3731e42450c8769ac24037 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff270b48c804041501ba60d83b29c180dfce3685abe17c9772f547a662de6eec +size 3944198 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aca29101bba338affa75a0172b13fee7abb7be89 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a9df58f668d33f845187c8b467ff640adaf7b444b74300cc39fc616ef2217e5 +size 5000916 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..791bccbe63c719e622bc6250b96844cb4424e25e --- 
/dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99768a39a8a6a12f85da75febc8e02cc98ccb5d5c2d8a35656dfabb8eb8b2fd2 +size 6090893 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9d80b568178b8e698a0ac704f0ec51e7851b0741 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f83e924c64648f9c6288fc4f7061e58db49d479ca59f06d54820079d35dc9fe +size 7175269 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a7f58759549625aeccbd1947dbed6d0a662d6f72 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4dade8cd1f02e6e985307ba42595b811d985144f923b74d3cc09bc093e25c97 +size 8255322 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb52ec6229b06dddcc70dfabd76bb753e8e10ab5 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e28046690cf0622f6d0265c63f47adceaddf25076b8f40559ed0401568c9d8 +size 9345481 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c2663afdd1152285a65e17ee3c90706282661ad4 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5174dd6e9b045f49fe79efc9920b98e5e770f9435404ad93e9437c3daa2d24de +size 2964844 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5110984bcaece7473e9a55a6e5d9e85857485a30 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b765c4800a8f6bd957fecbe0c11bd77f356ac71181b8ab4014d62d4c08523da9 +size 3459439 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a84dee3ea7e586546a810fbe55247ea2d639ac87 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efd8e92dca96e650bd7776e2c8cfe2376042d2a59206b114ed3b7d190f7e69c3 +size 4260426 diff --git 
a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d19714a236679d78d9067d0691cb7607bb349c64 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41f79860a4873c03bda1ea9433da7f40f443b19fbd9b82a739caa9541bbf4a11 +size 5051138 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9b1257771c1f9b712ebcfe4528c35e232c0a09d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f9a322bb8f26e42d2ad858fc15d4b1282a3aefa847a6a711381b3d04bc51b5c +size 5841422 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce5174ab90d0ba4a209a866399b5082f1e34e46d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_e2e_nlg_cleaned_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c38c4f5f9697ffc9977b50f7f0412f1aa625c5ea2adceb861d7b6849e92cda2 +size 6638120 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2826b058536dfa145f4fcdec033b1538a97ba998 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27fc93b28f3be1644616d6c57a4e8d1420a45c667230f6c607dff182647572b3 +size 2894168 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4804799786c0d483d6e1ae6c6e78b24d7404e272 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1281400ace0bca885db2494d87925e1ce3f1ee4d2d47009ede51639cece1e90a +size 5143410 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e93867575061f817bb811e07c100a0a8b342002 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11dffe4c4cad16d2af031ece78718e4e22f2031069c693c1ce14c455e52b06c2 +size 7322248 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..636ea9f0a61f77c61ea8c8f6c5b57deb791172b3 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebb00221fcb08169c93ef46f5ac3d6db70ec391f5c9106ad19ebfba786b0c250 +size 9603932 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7fa661175cf14ca149d53db293b47a5507c73ce9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6eea0a28625fcdfe2ec71564d9707feaed7b3a1d2248a5f18f64c33a08adb1b +size 23567516 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f8bb6211e98ea44857c96b87871ba2871e127e38 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e95defae7d272e942974f91e112d518ccd157332d664caecf9a6f7af4ea2188 +size 14076739 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6896499062aa36859604e46e6608276342fbafa1 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c6685943553a83e37d4fc3942395542f0dd58f202d85c063d8dc7a2883ee0b7 +size 2788212 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9a053259ee29879ddb84774b930a14dd20c31126 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc347803a31363439854b902fc52e11c48aa666c0cf332cc37718ef87feb98e0 +size 4978926 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f1ee4d2fca158c91468e290f51a1111ef4c7c90 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d5732648eb1a30f04abe92dbbef22a5f1d6e89469d1e719e1ea7cbcf6fbc526 +size 7127113 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a2530ee5eb570ee13ffed761183440c846bef743 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4840a877c655016d320dad8c0cd58f7d58a1c6f6bc837803bbdd573f7492e05d +size 9377209 diff --git 
a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eb419bd1ffe1d66e95eda3746eb4b69c40851c4b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00fa04c689432b950a69882e610eb18455e4117ecd72b1d459271079a8c193a3 +size 11534725 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e1d4c0699f6bd5370fa4314cb63a9c95ef23ac45 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_DOC_tldr_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09cbcd4560247d458d5dfd15692a1badbffa0cbc992747c70ce344e73a40ae36 +size 13794746 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5a152169744e89db311cbe53b463fb5234accd19 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e191799de52dde4ce07428b6ee7c0964aa19b27e06bd77459066a8e2899e62be +size 2812981 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..04ed9b0c8f8d9bafe41da7e6a99b97da10a7a73d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f45d2a95b895cd058c090093628d697bb2999bb036d7b3f0ba4f5c02f4408fc +size 5024413 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..014bf60ca9c984582af1829a218b0fc11110ae0b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1281ac16802852ec6096f6b73a536c138d823a345a43d968b228761f1c5b393 +size 7198816 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..95ca09e9d80eab38f9fddb2727bbf2c020961029 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a0bf867fbab4d4efaff66c953fc5dca74b41b237dc05506663836935a109604 +size 9469307 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..08d6db8cc553167eb8d2c9f1330a993bd2a2c9da --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95c3bdf081426c6457c49a8707bf09ff09db10f2fee98e5fa969f73788187138 +size 11630628 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b97e70612945fb4f8909f083b693b8b819fefab7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ac3daba6c2dc2bed53cb85ae95c60748f43858a58e414842b1260c8f3172fc2 +size 13897553 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cfbf4efdbe6202ce1ab76f031b7ab3dc42a241ca --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:805eb7c6a3bf07ea51183b9fce04dddf2f5046393d4d8f9be7a1cff72b061462 +size 2823191 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6c684b85cd930f74ce231fb87c0e09bebd7ecc6c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2266d9eaacf6801105a10ef4bd74b80f29539c559f9409bfa78391986d8fd780 +size 5050913 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..765bd56b1ebfaea001101dc64689aa2cbfb46b2a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee0a12c009e6cd0c006b1c1a2d53d95a01a6dc6d6018b9b8853b0a667306bce9 +size 7252800 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89136076adf993d0ac9fdf26fd73218cfbd45e0b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee09e7d188d5e3881855fb5281a1eee1d7e13fa629a9814cefd5b7ebbe0eb7f0 +size 9500605 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..981431775696a92b1f55219f13a6a14c4adafa53 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fff14b2d865c7d626807cae63d1970a5c6bee63bd7c515409914e36a3190df0 +size 23253336 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..00a3f09747146d42ac879c53812685a63713dd68 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_DOC_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddadf12af4d6081ee2cb5d4efbc7b638d15bd10f6b539ba32095eaf5619d0fc7 +size 13882366 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8fd03eb9463584915c9073b827a901c877738a85 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9dad392812a4712e9a9abbd7fa4f5e5773663027205ff38d182c5367e821a2a +size 2873813 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b6fcc98f6cbe908b01e6f2fd82d2eab86799d1e8 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14fdc153b02ee88c8ac192b5f0dd906206bc9659803f3a8b34c090aa66fd2d1d +size 5071662 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a8cde035f18a0aa340bb019c56fb9b2342f7404 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92de7dcbf6beda1c2104c7022cbab0c597fb3bde77de0fa3c546d57a3d5417f7 +size 7292717 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4a47845653a1c213d23d4708c8d9ca55b05ac6cb --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a698a94b629c4a67853e7ebf85c129be1efd436503e04dd69ec523c7538efcb +size 9569758 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c1ba040b8c406e9d5f47d48b6ef95f3fba48b6a0 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:774bc58a2c470c39aec24172c5ccb657d738bb599123eff7419d26326176c66d +size 23468914 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1c0f7d0e30d6f2f722735068aa0c3a6bf3ef984c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c83bd9271fa20e0264b5bd324510de4b45a572d6d625e1d5cbc262fe818784a +size 14018627 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0ccbfc65be9bd213df6eec5f62701b6e60f0263 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ea4ed524e928fc26e4102a807e44f72b3b22fc7828f75cccc801c4e25535380 +size 7207448 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14ac4202bb1c157ab9be98a7f05d5afc242ecc96 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ffd8ce20dfc70babe1554cfb1e3fbfe3d4e1a4d3fe11a433c02c9d76e698cb2 +size 2602447 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0f3e65972ec2910f7a30af921ab7d804f9793677 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15eccbab6ab3ad08dece5cc6fc4975b8274457cd14b6da6c54154a504592075e +size 3273873 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..59c214dcf659ab5d5974fc61c6f696a5763507ad --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c8654a111f93e71011ae3e9924de5c1daf929fe2b6a9efe4db65d88bb84d4ce +size 3947504 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90ae307ca85dc2d3aa869a271d5d8065bbc706e9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7cdf6f816894668af4b2c78642d5c84f9ffed98c23483a7c3784bf7a492ca1b +size 4642310 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bc2036f684073fe2432df5490851bd98d509d653 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_Correct-the-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beb661a58d5c1dce6e7a2495bd80066c809acd9cbad518c7d590e41985415f7a +size 5324804 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..e427b9bbe6ea9d86757449dfab4a4c6d029654de --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a099d46472ae4af0ac92ced0e47dd677f975dd43fecb1fc99aa6685c1be7d76 +size 2115935 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c482c3df9decd346f5fc00abccee2b98a02e3ee --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97bf18b2f511bd1d461a4f115da409b0aa8d1d71c9f6a47fc844359cf871a4c1 +size 2900909 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c8441c926533c3b93b2117c82fb0aa85174332b5 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8b4f4aaa52a0b176e8d8187fff5f52e9f1e86436daccb749ef93129bd7f0605 +size 3683911 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..774756eac1c161c358e25cdfa36e7cb6f9aebe87 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35b104e3b3612c3569b468cebe1807a475cf8f12952710b3597496019e2cdebe +size 4458372 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d66b5d2f037ec1035a436e9c209ce638b1001c1f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2d2d956f3734e4800229a3ad682116fe0f368a569225b4192634f01716a14a0 +size 5254516 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a371393e834b961a1342e4f6963947fc8e76736 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_choose-the-most-appropriate-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe8c8fb6e565aa391ceaefe605285194618e2286adeb1cdeaba6cb0fa1134721 +size 6040677 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f9f55b0f471732b2a174fe9ad5a48636ae4c03f --- /dev/null +++ 
b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ba60fab9770165a5d0dc49ffba2682559f2d6b88ce4a8d60bd577351f99406 +size 3003817 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..718020382f2d153d9de97ef0cd8bdb1c712e3a0a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a36e9e297e026f348bd62d7e656ca988deca8ca87eaaf158862d4b26cbe57f75 +size 2314240 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6db529cd12805a3fb007eee6102544075e0c7199 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b2fe6742d0abb0d4154c8c939bbb564d60dc58425a8390fe600fd4ac39623fd +size 1879157 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..11fa3233977511984dcd471404df1aba7d53626a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccf191c6ffb245fce76504054106dd315d434fd47cb6b0b496329ac289beead3 +size 2126347 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..af8436f414ff3cda9366e8ea04adf746d56c23bc --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:745436bc3d4e260b67010ad132e85fedfd366a6053f7cab71bc141d1fc18bf5e +size 2408401 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..82407cbff5bcaf6987cfb3a557bd097a0f21f64b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_no-prompt-needed_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5df8a5adcae9ffb04a42352bf01603ab5a3a98636ad9309f2d41dea46125b9ac +size 2695124 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db38eeae18cb274507f443e92f90086d85bf6131 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fe6d1ec7b37bae5b3662ab78623e555447052a034a47a8948c3965986ff33ce +size 1864129 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_1.jsonl new file 
mode 100644 index 0000000000000000000000000000000000000000..4898b11966cf16493d3a17316985d91e15d9176b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2511d6dfbb8cbc6b606e6cab9430b433c4bb1770fb98436615e09c1c3607642a +size 2557203 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..202095d226d9305418de9d51a881e523b6164b41 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffb91dc70b46d1e518d41416ad63bcce70de618d4074d4516bc484836243f1cd +size 3248305 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b44dc616ca459e8bf3c41bc1a4da841485a0e72 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a07f26ce9d81a0b0c246b6ffbed1708dcb5759cf76c30457139e1ce0988f2106 +size 3930866 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ae3604a1c00cbfa084195189c0f5312a9ebaf11 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:668af6f9725e7bbc0ecbb29b94ae14a515f648692c60e69f51756cd361068bd2 +size 4635110 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3e45bf1014725fabfd09bbca3c396882f0c19bfa --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_pick_correct_choice_index_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20e2f3b39c69cf6cb409add7f48ce1791a5efc58cdad8b27ea70b343bfb9b0b9 +size 5329371 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60881b924e8a26112601ae0c70c352d2be602ab6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b83e0642977368812d1caf09d732ca4f33825ba2225c9c9866c8398066958d66 +size 2257139 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0b575aaecc18091cb4f46c593fbcbd1b94ee6ec --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:21c4ac83c68deb6d33ed333f4757805d3c9ae232a72d9c85aa06ef8ae91e6d2e +size 3017956 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90501c47d98a38ad39cb8b1def4bac696ef210e2 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dba833a2bddffdeac7a0192b253acc03543a4a7c71231a761b45df16c3fba5d +size 3774293 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab02cecf3e1cc9c4d6a356e7fcaf51907b59955b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bceda8b51d4f29d99b67d1c975a9f039736e2e39eb03eabd3a415297f2dd120c +size 4519032 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..64466d41fa2f9d9b196aea8fe725784f5a774125 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c46a81ab2b84a795b66a926ffa11652ccf48cb2a4943eed411c491306cc91a9 +size 5293868 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f896cc29fcd6d252c1072089d20b9dac3b880f9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_piqa_what_is_the_correct_ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:257f3d3dbde1b37fa8e908f1987b2360a3a43b60af8313c68264f23f8dc6efd2 +size 6055254 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b08363e993ed2568ed13e39b16e7bd9534e4e32 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d5b2dc397863f580bf2da90888ee4fadb88e99f3d1ce21d72a9f826dfa0e8b8 +size 639903 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..44a96da14eb5e4d2638cd602fa325af82fb7b98f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63a2f4fb5674b39d3ec679367d2218c3931e0de00e16b454fc66a689e296b7bf +size 755039 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl 
b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a33b2c5f61f387a483552f852379cc22afd199cf --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bc8a9f50231c7dfd8afa29f2b89db650c23fdf765494b147b2b265364c0c425 +size 871246 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..82009b02c17b3fe5cb182ef21c54bd95f076916a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee61bf4a0bffdc8ed0e37d561a3d3686c9bc8fec26e405cb648056eec0dea04d +size 985702 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9bda548195045f74070315d917439c1f542f672b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01842c2399b1e5ec15f48e8bb2cca76f3c7d627b5ba005748bd6a89ccac3dd3b +size 1098430 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d31ea8fe63cc5bba71117d7a41bce58cc632e1ff --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e5c5b30f87856ff009516e69e093ecc0cccc69cb5dc32ec767a22f51c9967ae +size 1213605 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f3913d380e3547843b048f3021aa1ae1421116cd --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3e54c551526cf06391c4c7cf805badd0b12fe871ed684f39ea6673f752e6e0a +size 1182482 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5aa4743b7c3ffc8517dc42da3e5be482a3877ee1 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3f956517a5d35e8fd46955368d570d41db98b79919a213e2097dc24bdba912b +size 1779287 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..04d2a8d1b74a3988fd88817b61aee8bb992fee44 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:7f6710f7255ca0c32ae3209a33ac0c20b04f838cd5e54d2ba34bf518374fd4d1 +size 2388477 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fad6f8b4b451c89651c8918d86881ca11f876327 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0485269499f9e0e0d971bd83c5b362c2bf6605d152811d9e16be2cb1379a1aa5 +size 2973460 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..91b4b4464269423eb47fed7f1f787877a04cc4c3 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76a7865e858572192de366b12b15f9aa7bc723a57a61066a99491d8b578b2bdb +size 3555822 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..307596c003f70ad549fb41033373b06ef630ae48 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Direct-Question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b9aa9386a5c432dadd5e4d8d5acfcba33cac14d3d6cab3c79e28d4f9311beff +size 4144625 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b17b9c6d350666c7ac37b3d63ecd2e9e89ae6dfb --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9bd11f57f1805e1f3b7d310a47e83ada44c823b39d4910774438bbff2acb719 +size 1328975 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..18ad454f9dc78e43f46701269dcfdb825f6a5677 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2bd277cecaa883f8a438cc26e0911f78b174c339f9ca39f963dc04123d2188f +size 1524492 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..98c17a6a3f1c2205f742cbb69cbae2fe742663f7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca6557b0e5a73ab742a591337ad671f393b1d97f8a9e646daf0198bcd5c2822e +size 1720655 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl new file mode 100644 
index 0000000000000000000000000000000000000000..2a76a21fddeeb7491466e2cd27d7e5a62b985803 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76962a1ab0ff0394af9f0c1a9ae762e1c12d8c45d57b68a7b0e1ecd2e87b85de +size 1915741 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..76545f28d17e0293de104c15c2dff8f994666b34 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0640c3b827d4f5a690cf5bfd9b34e76cd52a3d6e76c73eb61fea606cad16da63 +size 2107414 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e0ca4d674212ba3a75390b8119fc8c9f0732e508 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75ee969d70e74c7f755e94d6d44a4e9d9319df4659ca60687c1f82dcd0d34714 +size 2300971 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9f6f21bfb89e3d1b520aab2a0b06859670c88ab --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55c74e82bf356cf282c893aa09ab67fa07970052aa249e58448f33e6224401cf +size 1935091 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13975635edc61b4127ec1ecd927e1bff971c0c77 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47bf47417aa83d725d51f7c8ee1349806dfbfd08afbd6e5991bace9428344f2c +size 2636414 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db91bc456a5faa5603f0060016733e04b51395cd --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c92b352f2e920c4e59d479823b1b37ae3975707146f307eb9bd17647a5acf49 +size 3349489 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a92f0650ffbb95bf43605188fccf08060607bee --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_3.jsonl @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a53de8f20defff76c8b7f75edb2aa73d13441095ba8ee6ac3d8ddd62ccbc8f17 +size 4039125 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c047f968b41976aa255b9cb9cfe8f4895b15fd00 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50f488f25c20b6c217d034fa5556365a903177ba0b11ba43420e3022071382ab +size 4724345 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db43081cfcd638807522d87fd60b995ba65c7c22 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice-Question-First_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c3ed865effd2f9048a787f85ac2b569d364b2649533b6026f1bbfec840a2b68 +size 5415598 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ed373d86e7aa2865d9051040c176e9ab835e4db --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d447c3e034c8d2cb6b7ce2f9fec4ed46841ba8a4f1f4a75aa050eb73f129807c +size 1870134 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d060130514bdc5ce4e690f9587856658c2f696be --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4290b8daf453b2efe306b3ebca348dbab7d5842abc75748cb58fd3ac62e55256 +size 2545360 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9147c601b31d36529f7a58c19f37e94af0bb8c46 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a549f5515e013491f623d2bbef97198ed4fcd67f24f7c83f67eb1094d51019e +size 3232551 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0f33e87ddc269e63d0a24a2fb4410d9a7d1f377f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e98219fa9b4ffeae5ed551873f71244e24c5da3fee2e63ece5e296a8fe0298f3 +size 3896119 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..178d6fc926d859a0087d28f33bf5da8dc31da142 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:438cc1dc17e29c584d4b64caa972ca843b0c0d5dcdfb43fec032ae67a150d0b4 +size 4555424 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..952cbe2254d4904c98b95f0d9dd89e65c6e32506 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_sciq_Multiple-Choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c67d6993c15bad09bdadb4bb55508d7de770fbec04d3c1c072aec7dd8e0bbd30 +size 5220616 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eaa0f29fa349a98263486b141bd05c50cad6110e --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:829568b06ed7cbbc7f5eb71a38f080d6c4e1a3fcb9804791f849e462954ba115 +size 2213755 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..385437fee2935ad4d09b7fb7d99d50136b3b9573 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:736128d06b5673e3aa5f1a87e7542c0ede0b05dec3d6f73e46db707be3667a7f +size 2975036 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a10b6505edfb9806ea0ad92f91dd92ef8ffde8c4 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80b0626437755f2d0d7e85f2731736b1ee6d20087a83b6c099e912bac9f02b22 +size 3727212 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5506bb7a8956c015a5afd6164c2abb738c8aef1 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de661f5218e100837fd239596242fd9761e604dbb5a0c5142ac496f7eafb1e7a +size 4478871 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..521314c661677b51f8c7ed75cdf2a6ad63e94847 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:dcbc4b15e0e4c6f08d1222ceb94521fbcf3de1c077902db0b9f297c40632552c +size 5230651 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba660dd24365585f4777a4b79d1aac458ae58d4f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Answer-Given-options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e8549a39da5acd73c2d874b5ba8c11ded5a3f025c76b64a06d8471d1cc540b9 +size 5981210 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..670b78981e59f73caec4c0d326bca7bb50ca1ab1 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a59a7ace9315e29bcd1f531c2b520ccdeedb27e5cc9584c089d946153726d92 +size 2361877 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e0584f346a4a56539fa7d5e7b9658fba0bc6cb86 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f036e9550790a31ef1b4fb5cf8b4d6fcce7f9502fd0008f02c67459b512123c +size 3194258 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1fdd1f06b6102c55d6375a35c9446bdc259867ea --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8590241641a361fa556eba58e2fda4e208f5023e50a539a64e1e9ff42f12b3c5 +size 4017140 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fa31a61c4547605cd26019486d9091ac4c7c46be --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e785fc442d398835c23ca200dcf5d71db7527aac58de3dbae2190d2ee1243a28 +size 4839338 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f73af9cf96c3c8f14b3ebf8ce44bec2bd1b0471b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f6dd68229d54d23c64467b22c8c3c56359d6144ed21e7f844f4b0e28ab55d6 +size 5662408 diff --git 
a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ddd72389cf8f12344f27b7baae298a050d3d2a1 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8464868dd6ba7915c9398fd46995973cddc337cdd5f021ea2a8c3032dba7de3c +size 6484050 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d916cd2c2e7dc569b542ac69fedd2e799300a481 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e0b23993e10028d4ae81e74d19c717a008094635ab3a433ce7873465b4671d3 +size 1878825 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29ff1173ef80df4e796fdd36ba63498b8744b3cf --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56520008130a39e61147fb838060f8ec2db23e816d1e9f50c25c1b5595ad943c +size 2434754 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f97a01cee99115f260b6d903281a67ab1909995b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13b565ef218c6571a44e2d833b2bb8daf43f8229fae14424b4aedbcab715f5cb +size 2985191 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5d22c0c63cbc77c636eb417d84cd3acfe8bd7489 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02a2227cbc57f44965ddf82ae71d2d5ad988029a7d0a6921e4f1c0b40295809f +size 3533654 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..42b78f95272289071985052e634888a941bb9e4f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21718f99d98f0c15b33c88019d34bbda7741c859149035778b266d3cc80e0e08 +size 4081948 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..c35a02336d7705f1a176f37d51b6562c0a3c7e80 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Generate-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de1ac055d73ec166efeb1ca7fa6da7fddce602f6202fa60c88ca1ee8ba5a08ac +size 4629125 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61b6b59c5cfda1420353d901c110b16a419f295f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db1d8caf24ef56fa31f47e31b4b4437bcecf05aae8cc97a9e669a3d81acf19e2 +size 2371810 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9a0e072934c27abcb835f6b87d5d1e13a338cebc --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:099867d7648bd6323151eb4b61c6a5165762f65893b6af29ca9c83eeb17298da +size 3210859 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d8b6ea7637227cfc143ea75dfcfdf3cc15436b9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:920e1a8439c72c0c19e10ea9f10b59b744c7fbaa16be45da902b83e62b56ff53 +size 4041504 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..58c418b632aed23f8f187212c59109629fbb693d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09989ae9a025782305c93e2b79e41742673683bcc432dd4bdae53b31c8d5954e +size 4871640 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4b0eb50cb7183d99934c59accfe92cbd0fa2c75a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5a56e6e4eb793068c8340e356ddbe4123d363d455baf1f1d8fae9492e8daba6 +size 5702050 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1e64f133c96ed740a8d1f2698b0b8c817515b3e0 --- /dev/null +++ 
b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3052782e5985ca07c100b56f8def7ee314760ed74973e92c329c990a68e16932 +size 6530976 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a913473207f92b0f2afe3319c7ce0f9937b34a27 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe1744b228b2368a23435befb4d474a0270ad39fa65d1b72a6bc5a7f7ab8318 +size 2344844 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f56ae9ee2c538233bae45f855b2625dba67c42e7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00419336e4cc6e824bdd48c738413e0e39b74f7fab8f81b38704ae4fbde74370 +size 3158822 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d6ba02d6df512322651b6ff7f0a15e1cac468a10 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f31505175ffa8845dfc916a679a354eacc805f10a034801c06e034baf6d3f174 +size 3963194 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5079e5441b749ba877d181df423badd05e8411df --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f7167d34dd07e1ef463ec3c8716fedaebd186007f18b9918f73ba65c76563e5 +size 4766577 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c0aaac7d59c757202458d062a3978a84d8dba613 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77e4d7249f3677ef46382ba18ef644ae3811977385322e3781959476fae43d27 +size 5570941 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7281fe51a8152b0919c9f59e97dd734296204df --- 
/dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5c4e2a12b2bb4b02460963db64a760503215bdfda8a0e9a5e29ad32d7d365fa +size 6373633 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a0afbd56b042afd6179ab052c529eb0e304083f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cebe4d042b5d155720c234431e4eaa1be35fb47bc2af8c49c10c86705b8fbb97 +size 250482 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e21570425008ef19b0a52bb1443d694a22851f46 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4d7677db641302d4358bffce38f4f878aff4db85c75c1f6a7f7d570ecbc6547 +size 351343 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6eea5c44a3b75940cf589a8fc2d1c7a07addd308 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c0da01a453ea8c77f636f5692b86a2aacf5afae06d00bb503d0320e152ead6d +size 449647 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4f19b58cb2e56c40fff69f7f30ceca8e0a068fca --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7226f3a9bced6c62581dd201c185e7c7ca5cbecb79a545a3ada828a2d2aed4d3 +size 553010 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6578239c54314514e532b97b99964cde0699184 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3c382b44fee7102a50b060c86f9f5fff64c341bfb1cbc8a622354835ad70a7e +size 651627 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..06b355628c880dff61551035e06442293046b7c0 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91d48468dbecd604bce1d66eb1f9d2b6b20fcdc2ec50c54bb0ca2949a4be6a3f +size 747526 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_0.jsonl 
b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97061724f76d444ff6d84b1bd5040ac0187236c2 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:291715ce6cf85153d9448786f62fd01e60beb13686878afb1129302c5730ccc6 +size 293139 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77c0f8d480bf954fb578d1043459d40a43a0c9a9 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0a06f820278d561016b192bc8075a3b7314a10668b22209a99910f30591f346 +size 415394 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..93eaec96429ae896604dfb8aa48b72f3456b67bc --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c863162d82c75d48d93188889254b9a01c31a5260947297d57390fb37f72df5 +size 535309 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..99fe9cb867d9e663fa4250b5ab0c56ed4d01c12b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cad9a28391d809dac28b643ab38751889015819376d7511f4b80e0cf986a4ac1 +size 660291 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da356c077f5ca5da19e5c80bf66a4684983c77c6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:886030c944f0ac33aa78c7afc2e0894992ec4a244fbee0102ee28b9b77d5bc18 +size 780480 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6cab8dae36e259dd835b96388c3afa9a84c1a102 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f99f610ffc56ff854cb64b43f5ef73e65429649151e369607b5c9a677315aad0 +size 897970 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bfe34ff2d3639793ba38cd3f1263569ed5439f56 --- /dev/null +++ 
b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63d66349d14bba4deb5941a2d2511e25ae7d73a938e580d543f0fed843c16d15 +size 258394 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9cbf1036aad0502985365fc3c1e4f6e61260a9fb --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be846c83553992971191320dc0b1680fdb88d47b14588340466da89efe2c3044 +size 363042 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ac55b33efe35af3cf95639522685d56e3c4b9ee --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d13c39e25eca16eb6944d5a1eaa80d48b528b0ea5f8c0060df7d944e35e1eb2e +size 465222 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8a0a0180cc84fe7ef5a95c4ee767dff34f3e13f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fcca1ede672d159b0ed8aee0de395c1d7190f461a5a010182538a8c75754c7f +size 572455 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..061da6dc4a7ed63a9298e23de6b875a522815eb6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb5c14af3b97074257c544f1a168ca887099baa4886a76da87ec9104110433c4 +size 674928 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7d264fc70a1197e735156c4d62ad5d700fb5d7ab --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_does-it-follow-that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9845038939c7a987d3d617f89b1eb6ba6a43736ac0bbd8b24d99779138441090 +size 774686 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4088e960dd9cb1488a3b18c055d2ff2e8c686ce6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adbef7be7646bae66af451a65c7524a16f3ab629fb97ca78c60d508d2f2111b8 +size 261271 diff --git 
a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2689d3318836b5dcbbc30cb37e82f9d5586fba7b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61227a1214183ae12588455ab074a34a19e1f8ad7f561d8cd1915462b3ca4d5d +size 367750 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5c16c25eb244d44e193783b3fa1a48a8001c572 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:115af82047a6aefe3924903781791cf88dd95d39ec74811644a2c02aca64e52a +size 471876 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a5bab04bc5432077eae45bb70f0329820ca0ee8 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f19abe26796db5a033b02d46b27ea8be9aa084c99008ab1e063a144dc026f05f +size 581069 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e6e5f7ceb61896e76e02c91e7f69e9dbc57845fb --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3900132a0df16565ceab792c511ed946b865511018c96248e884bcb57b42dc3a +size 685478 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4548059b36dc5eaa16162d3671048ea8f474d3bf --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_guaranteed-true_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3539404a0960d1c922f1936c41ba1a15e44c020ed5efa441cc632a7a3d19791f +size 787174 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ddce3ad1d46344399863e56740df7d58e995e68b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:888ed6b0d423b83443790e454212d51edb199652df99b769efaf6a740da900e9 +size 262403 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..351a660da50ad873c9b3e0569cc06ad15494b66f --- /dev/null +++ 
b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d98661700638905f465e2a68a139d716fec832e8af793801dc2503f2643128f +size 369689 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..46f0d383a1ca8faac1d83eaf3b312ec83b523fd7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9c6b184e6fbffcf070f914ba6c9ddd1a4a2b4a79e64ef809c5d40efc631f958 +size 474666 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c1e8c4640313ee5054e3012f3a266f897bd73f32 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca8866e00ad0aa0f28829b1fec9e6bb3697f5699d83ea53d87b60a6d873028e6 +size 584690 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6246ff53d15845ec2c3cca18ff95319f93dedf44 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9ffabc06db11a7fbcff2d86f650b471fb560a503f5089efb43ec6ebcbdffa36 +size 689944 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2bb71b666f8b264aca54c0e92a2648663f3bc400 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_superglue_rte_should-assume_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c937ba210e3837c1ed022bfb84bb3479043ad1fed99098533c3686c18187c2ed +size 792475 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c41aa2e02b2370274d38c7443056ecccea1b74d3 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:398951e4d49ceac1433d3e8af66e06db9f64f7019d3e24b34c284a12dc4244cb +size 1039031 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d458ab62ae38740720be41988367cdd8f8da6fd --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e50ba2f4546858ac6524de24a127b0f74154a532342a1628891acd10ea9849f +size 1300083 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_2.jsonl new 
file mode 100644 index 0000000000000000000000000000000000000000..b57e75909010fcdc86b8165df76a606affa8b81d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c6c6815c74c8d4ec49bf7a1f1a08697974659e77787f1636bc7c23731036f5d +size 1561219 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..720fd0527bf934d8344ed82432feb1ed70d127c4 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fe693a5c78c0381c921ad070a754d70b4057f90c66884ca2fd4e5b78fdc4611 +size 1822687 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..434c1aa154e0ef545429606fe4919c72c40e80cd --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddf877681ebd27fbe094509537ffabf21371be6ffaf05896c1da46ff69d31623 +size 2082938 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f506b3f5990291d85b3267215813f54f11262ea0 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_Replace_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24cc0351a923bd49b012db47eb63466e9e892bf3ad7e356e85d62a69b43afd98 +size 2343649 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e245f293005aad140bb18e66c2ed064098d75a7 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b38a93b006d8627048357bf8f028bcccae590a0c17ec1e69108b542f48880ed6 +size 948111 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29950866f8bec82231417e8fa6acf41d3faab025 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a339ee388b358aa8331520a557bfbfaed459659cb621c191c0a8518d20d2b129 +size 1180562 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..736fdde33b95ff3a45a885ea15f67f3f7102ea98 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a4cad43f8e9288da493f32da102620236c7907112bef49162ae0f389889e69b +size 1413351 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_3.jsonl 
b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae8c506df1c6d9f4384d749ddd2852f98c76da03 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a998259598041a6da52a1578947b12672a7ebff37b3af9a6c070d34507f53095 +size 1646368 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a6f5827e1085c363b64e622a41a0f4d89e228ec6 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a66d663dc80af167c37628b13e2c83315cf466dffac1d35a847e1d2fa265e667 +size 1878358 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..22ee881069deb71da235ac55478091c025240d7a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_True-or-False_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fbfcdbcf0ac2fda5fd524a7c07ca7d262d1db9ba3bbf05744c29a0b42e19b07 +size 2110488 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02e1c0e41e33a1d24492488370c0f3956e3eb345 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:857b1db8347bd0019004fcfa932e185be228b14daeabd6823be594d8c6fa0fdd +size 1009915 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f5e7a00a74623d6c0a6a3d3ee1b700174f97742 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05f418745918dc17f680fed1e524193f3112133af5e5af78939c2175a6e0f4ef +size 1243177 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f4ab50bddd559cfe887d0190f810dbb5522a117d --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d88f4962936db1458b91d7ebd5c176b33d813ef2282320e8bcb49039f73078e1 +size 1476438 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..597e1390b329b0cb0375b381fa83d7f5bf2b7624 --- /dev/null +++ 
b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c996cd06a1fea22b377a0614fd26136d4da65cc6a06d34f4b36a02070180c764 +size 1710021 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..555614ed2ba1b1153b3dd590a7cfda0569159d82 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416bae1ddf9091cd92dd5d1573eb95b7fef907924754c9f1e376dbed2a81ac44 +size 1942393 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2bcd8c22aa79292e4469cf09d7dec2070f175ef0 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_does-underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53d4f11a622149efdbc1b0d8519a29f12cfecafac13d81d67648c77617edfb9f +size 2175219 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf2f404c2a1ad627ca9fa8517d0708ec4231517f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7abba360b7095a3f8ec141ef868c623cdabb123955eb4333d27d2ae91787650b +size 969393 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9809aa0209792b5d2c09ebe279d74a3a60b01b8f --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90c2f0b68e293c3dc0db4185f42db17d353836e5ec040dc8e388e4326300f68b +size 1205158 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b33267a8937c184f657f4aaa5d0ad2eec17d1947 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cb56a91dad89e9ebff755f9af195075587d2a28f19960786c4dad533cd3b2f8 +size 1440957 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..855781177278a6f3d2812d8d73bc4749b4ed6212 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb5cf458564aa1a7f272d52b45044ee616f77c5260b4d90cc1cbd113a698443a +size 1677080 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_4.jsonl 
b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a3b1255752b97877a831b844a7a2fb18d99eabf3 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7920bdd8b5ad4d5ee30a65b22eb71c2f3648b72d40ba964df3491664292d7a23 +size 1912050 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..36a4552be63bd09803d14eb1994205ef8b527022 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_stand-for_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:014172f4b0f03d835a5dd74182b085bb0b321a896df94d5d7abb0d1164ddf176 +size 2147430 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_0.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..528846050bc458a5132aa38466f7e0f30860ae95 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b256fe47f27b3676df5e92e693e662be6c0c9eeb9944f40a1f8c9736ddcbb725 +size 1016324 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_1.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e64fd29ade63448a2d504e53629f229214a8607b --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ff4e2fb1c963c4bcdc41f15c00437113efdfef0d84466310a6207d4a8c611d5 +size 1257131 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_2.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b83aac934938c6504d884fb3708df6207a8e921c --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfcb5b40a54dfa5c2225e9f940e0e9aa1e5025f27458f6e8477d142ed6dadf27 +size 1497925 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_3.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..439f5dbfc1999fddee5706c1df58b1740d082dab --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bebde2f20cc0fe4fe8834901d6498080301cb833b53ef146000166e8f53c9f4 +size 1739066 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_4.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..274c4de41565f50bce3d353d288faf2148018534 --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:b8c21fd0a54fc5834722e9af30961b6c53a0aa53ac0fa2f452cd700d97d76b79 +size 1979058 diff --git a/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_5.jsonl b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a4c4a87fd4f942dc067b0d3cd7c4f894bddc6e3a --- /dev/null +++ b/4b284b21boscar/eval/examples.4b284b21boscar_winogrande_underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aaf42d0bdac2eefbb697ab129e317420982b266439cbcc3a6c0be536638ef36 +size 2219501 diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b2b3dc60dc4c8acb7427a70c6f98a3fa157f8cf7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.3119407628991087, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02839548709754242 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07438946576288918, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0025513647065143596 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.2632468762241862, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004868249221352049 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.0990806449013871, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021415843483912466 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03312917849091313, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014174063689487894 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.12235027642168965, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0031144296943736486 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04529204402030828, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001254298305442429 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07117230529834999, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00236804160887153 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.25570352079335334, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004721463101193061 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.09534482558563832, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019930933840825046 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 
0.07060394534578683, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0023788303493259178 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.25086964131360756, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00462453643406655 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.09431820077676166, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020087380246042903 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8ea12de9c5d31fad4ab5e2b83caa95bb54cb4fa2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5375007952609668, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.027971198631201743 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.1520773825012676, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004748290705934928 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3227263264666382, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004944101731638509 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.16868686014694956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0037497452245084024 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.07832284618044694, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0031613171494569007 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1641704139450893, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003490737203218088 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.08499563989738917, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002483035814872989 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.13745282575305529, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004251980584200159 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 
0.3019875028781424, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004563649108443115 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1534335691189539, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0032520482964081736 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.13987460558088444, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00433835517591582 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3044600652888652, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004600132970650535 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.15571056260264077, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0033278332133086664 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..170f88a8a61391729e2728e3f6da663448f87836 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7869498474646944, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04640396849840582 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.19796396527381735, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005701278438899864 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3706483303216963, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0050173611899764405 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.2084575980044784, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004415820801430582 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.10467370587595308, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003819065495086435 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.19711908309071768, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0038874252766392605 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 
0.10954670655177427, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0031308589770994762 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.17592108046935254, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005006735502669654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3446684722385714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004645081391462266 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.18766755781965408, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038435725948238425 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.18060870457183115, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005158605822048483 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3494009114809856, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004713912149914969 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.19187903453194924, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003958689064769724 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3d869466e5aee69b172a949faf832d41610d70f1 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.8781645295094662, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.023295523644902574 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.20253583048997179, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005667155356549766 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.39458836927395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00508328824891432 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.22142348005369172, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004576657734814378 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 
0.11025140845494157, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003876483056835859 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.21020652499163808, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003888206554440605 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.11757388147146428, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003185839232819997 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.17821674790607994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004906399590419944 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3645388282549846, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004646105275227094 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.19753768298479737, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003907531687545351 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.18364695968481887, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005076299243812921 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3699350432753926, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004721581387764664 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.20241133896176328, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004042906289046454 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5b6425325f4625e4d254a6f53d5099a95e0af071 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.9900707197163517, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.031011845880251587 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.21624013657641925, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0057958787897655845 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 
0.4006689051971167, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005025377089358478 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.231391578826814, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004602731955828627 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.11741133357899738, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0038707531452077852 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.21636504595775147, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003993049382207552 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.12321787830184827, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0031814702327693906 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.1866671922061064, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004886171018823189 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3671506160865042, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004552611209109354 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.20337113659463418, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003830528444749543 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.19479702587195674, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00516733338200102 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3738824304063063, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004627178059919308 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.2102687170624152, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0040377255274080814 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4cf985263a7144ba482cb4d253771f4e0c63816c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 
1.1661469259713872, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05442011585382653 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.23542620246457463, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006120612746647669 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4126382803733479, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004969059062845303 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.2462703929154201, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004784081954132341 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.13414542688077327, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004317950948756051 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.22644337464507955, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00400180902077526 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.13531236031865498, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003419155242253958 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.20531156621874325, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005277463604539717 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3773305254767969, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004505345415815039 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.21699579683860706, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004030577049307887 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.21360232440350232, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005527478664092985 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.38494058675880927, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004590859610063705 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.22427477226716616, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004231158968354307 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..50021d6ecc110d0623409f1ce572cc99b9349b60 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.041346373718141455, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0010599623460339127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.25211310702802325, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002788455474554402 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.06377388411846296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010349413916477116 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.002839485845793308, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00025451562903735196 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.020610793870894942, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015955007557906615 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.004534425826606448, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003879492990087019 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.03934308508221285, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008489690182718232 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.2494389106117795, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0027727843463422105 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.06183696371033167, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0009056912304361808 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.032285247594853815, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009220039700099138 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.19883286423228544, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002350462700574554 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.049196335508131475, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008582052072391212 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 0.01632350783277848, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.002358717732716759 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2d08ad47895673003108cd10d3ef3c638a164b40 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.44445132729923204, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006526455541829097 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.3889450416416867, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005166406539461182 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.35911029870880495, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004787852480889677 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.21846220594720275, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005005245010753829 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.18846751093405656, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004061844071462999 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.17232524259944224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0037325291663246048 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.36432134727711496, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005635577543148762 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.3229414171436671, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004531449966392583 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.2930536919182777, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004002123970091481 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"explicit-graph-description2", + "rougeLsum_precision": 0.3866016578521912, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0058725916797575575 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.3373116717187921, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004582899451391632 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.3097862182227815, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00415316763055169 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 5.568248943628284, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.15936131698908332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fa722818e8e0ca34e49181860246711fe6845a1a --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.5921712622610545, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006080920495444881 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5256791287055436, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004787960765756497 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.501627380148766, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004626333035685351 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.344221483181946, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005178410791483312 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.30078742559239424, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004299159471007747 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.2868389890412573, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004120089080937723 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + 
"rougeL_precision": 0.4827491624545944, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005485212821791447 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.43467949164870573, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004513595168941902 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.40933775812934475, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004227695419911034 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5175873252849847, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005679308606153899 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4594793391268419, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045209528301636695 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.43681937401317794, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0042947259171227025 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 9.327080848315214, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.33074776428543545 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a09b7567e7434ef76ec10937b99465ac3550fceb --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6196402629179393, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005522213166681634 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5364136875813985, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004681515049484617 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5296729046946752, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004241547746449502 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.3662989314092075, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00496675265113463 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.31326806717833167, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004336125727613282 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.30821667509197176, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0040240686439643155 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5062917357965477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00508050519245006 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.44263683330106596, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004460129545645125 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.4331671556081239, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003993476175551676 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5432394466733573, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0052393692520715275 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.47055228809438604, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004443920194380233 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4628894235546133, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004002328864605658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 12.36334348883012, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.5943366673610281 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..efaf5c5066a06683a07c34b4733aa9399f95a0f9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6365786298113485, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005343609526718168 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.535103469511539, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004748804702692128 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5402829312776871, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004197339364184242 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.38031962461269936, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004916807569648106 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.3171258246836024, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004430877750976238 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.31819759490776656, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00401669725008511 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5204631665257551, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005057831046431135 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4411323450746604, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004524079062455917 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.4417213853690807, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004018127749079752 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.557753905383597, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005156190226040093 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4677559934260435, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045047882754511446 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.471525411066689, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003997505234098393 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 13.83666464772769, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.5924556563703853 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": 
false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3639dc900051fb3191566e1f2c3faa66de27d2de --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6510118389549906, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005033868263865303 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5398764737436825, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004749394572676228 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5504705447467583, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003931470606428917 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.3895468651617418, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004836511207740123 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.3218179502144929, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004358671474235282 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.32523157643827644, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003905427714337188 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5320306794072402, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004844722059279852 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4446621595631789, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0045022387221370155 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.44940514442893853, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003799590605940088 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5716550921263407, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004948958293419547 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4718974866236728, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004467252768544588 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4802802956854494, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rougeLsum_fmeasure_stderr": 0.003755409047032572 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 16.027089803530085, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.4299479515875188 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2915c78af9373cfe3b46340b84bb0e9c001c9c42 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.497657873618323, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.012349325099936713 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.04224352625871548, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0010010170277480635 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.2907403847735972, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004512637085950553 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.07015638110137974, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015061107866413254 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.011843871110726717, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004944309477955418 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.07756192597479347, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003065548774425208 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.019528844760768004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007823549661191675 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.03916091211293898, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008415465548236258 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.2777346312680996, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00411812624577053 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.06537354397270882, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012792117359157506 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.03409147439298827, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008881130271259737 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.23208716933748721, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004219597703853115 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.05643295459659358, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013478369359460178 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a54d2fe82c1fef673bc0c1b7c95e5395b7901dfa --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 7.427390130114957, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.43368679373129737 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.5191085718453924, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0062926152585314095 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4305337819646001, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005069927095860342 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.4118960520453098, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004628613219555022 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.27355769670764424, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00502844636364607 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.22569201243104556, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004180857852458944 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"implicit-graph-description", + "rouge2_fmeasure": 0.213133329600957, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0038153064397342023 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.42309183690103064, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005570286910843408 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.3541307751631573, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00452062225618094 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.33429534379484965, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004021868671783561 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.4514869427389182, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005793363430064205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.37291654050475104, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004610348051058544 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.3549292721389946, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004123801774630897 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b531d7a08e96679f107548aa07d6d94fbc8374e5 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 11.690802198271522, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.31962481261541265 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.60938885297845, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005736762572015516 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4910912194202063, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004863683461648982 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 
0.4931339577050635, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004313965951232142 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.3538438266464095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005179933434688372 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.279763244723918, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004204372995712399 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.2805748499547808, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003992048922184352 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5022619497278473, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005313489957991849 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.40736431997199174, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004488232749862428 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.40541177161378195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003964253611695142 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5338817231626088, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005446991150797629 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.42899667780520745, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004529150486034353 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.42949056532886654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004009506107488504 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d21fd00ca6681d23258b5c9413d2619cb6347536 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 14.015729199684401, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.2861398682078767 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6338330153613686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005504406796192468 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5032857728045119, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0048083765697394315 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.5162716716149718, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004182805028604736 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.3755638462897083, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005104018049016375 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.29412280640119604, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00434667917909 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.30061442097006785, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004053287242140008 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5245847967636375, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005180633927285279 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.41888362437737175, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004531219879037914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.42681373245384246, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003970750290746653 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5585561781062417, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005301431671802071 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.44153744457573163, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004518919451629006 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.4522158954741629, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003949519994979285 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6cbae0145c0d71ec2ca6ae8773d20715ff982dc6 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 15.057505679492566, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.19539023179986367 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6455851098568031, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005274725748277269 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5097818550726657, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004854689261667029 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.5273437410664512, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004079973205520906 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.384680042250048, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005024147317886237 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.30024070682614185, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004425376116803175 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.30881180787788937, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004024262874364813 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5326343859410355, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005055378455270224 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4236581068981246, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004613640724561802 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.434864604939784, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003935277234180222 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5687274941110855, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005165852198122328 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4465950275423008, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_recall_stderr": 0.004587981631618588 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.4615325857376345, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00390879554622888 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..678a616b5fec1b2b3175274091e74625283cc511 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 14.913912914173546, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.339827970669822 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6523868918453206, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005062625589297553 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5085753634275134, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004805104401460468 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.5325855516384019, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004001597007402866 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.39050183995250265, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00492489283628371 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.30262983728520415, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004346922166045064 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.31441853558362365, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003975943025845374 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5386923972401838, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0048732094440267345 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.42372630591396704, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004547327839305676 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.44000229896478804, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003849076820560861 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5754652293985637, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004984504054500151 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4472244706278206, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004550869580659433 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.4675136302012598, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038498013228786825 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fec5ff3f1001cbfd486698bfd0b1fd77da198261 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.0314881766646196, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015126365359450058 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.16236274800989733, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004098228994255239 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.045274028182706996, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001450220774744457 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.006796590729668326, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004941842666750881 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.04018995353352481, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.002384955095113897 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.01098605507786806, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007511759228558155 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"non-explicit-description", + "rougeL_precision": 0.029272367859116636, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014275614818918265 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.1558380415065589, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.003946147985654718 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.042154554721966754, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013001281158823512 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.02834368778173644, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014346203472812527 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.14391153007861998, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003595338406210201 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.04020710158810906, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0012878319091503328 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 0.1125579135927861, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.013714020504806836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5319a4cad893b8ab1ac1042dbadc9bea5904c6ee --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.4453173150800215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005867312457939903 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.43803748852474483, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005340184338954727 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.38692686155050754, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004548113492644709 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.21705422098748592, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004510213787992247 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2171325390790844, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004282000275127611 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.18729231846618402, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0036584106265992998 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.3651358482383703, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005118474245656637 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.3621730834168619, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004648257519661906 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.315850989957773, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003848944382861772 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.38828248191877074, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00533391149324851 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.3815163353147994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004780745611029048 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.3352406893630061, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0040092153733560084 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 5.608709262619956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.2845344912673943 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..791f128fafc5fca2cd611306c4007ab72b4fbd41 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.594075097469876, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rouge1_precision_stderr": 0.005735614328570698 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.5189486365321683, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004964847282332437 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.5005214985653731, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004384268530846718 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.33511234833300463, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004901792389818218 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2910349230767627, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0043654656293511465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.27862716831146983, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003948265404921788 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.48216021689966443, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005175532998377631 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4271540769347934, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004639021202183366 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.40637383732751037, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003976359495955301 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5163251014695897, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005331549511342829 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.450327008993731, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004620001309284672 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.4328365405216621, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004021762465773053 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 10.191385722066029, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.5616979090322316 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ 
No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..96f8300e6fc52542aa6a448983aa78a4a69a395a --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6422175208003964, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005158556091166668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.5220235781715165, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0048808827264682445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.5327228493532781, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0040829606282401934 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.3710515262643343, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0048873929905967495 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.301460611554934, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004401012474144791 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.30446134771742095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004015625700663121 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5224004508992472, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004880222498755768 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4277850121139605, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004564467034606729 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.43314386481790423, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038756825295561946 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5597535897029906, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004958294292183076 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.4537322290003083, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004551965114749363 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.46163771703476303, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038162152037734846 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "non-explicit-description", + "bleu": 14.527790269592327, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.4446902271294265 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2f391caee338dee024091eb678f04c9cd3902908 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6575431547631965, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0049491040890518685 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.525349510502271, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004884464622105695 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.5434372041612334, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003978622662539026 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.3840813340290783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004787008650855236 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.30651040101987426, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004431179075251621 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.31365258819037656, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003924794819481102 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5336789314225935, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00480848453447696 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4285556303170423, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0045947492289961775 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.43987603799527164, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038197961382302934 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5711707553995303, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004829283144054192 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.4549215035973506, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045583348223674925 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.4695951533979971, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037684239929364916 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 14.718574394473302, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.2197487642536066 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..27eca98f5c4d1ac0b8b73a7fee5ee81839b81047 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6733576969554074, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004783096353879735 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.5251265497060836, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0049951468314383225 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.549668413334997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0038761839348562285 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.3989223056863438, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004801014921009065 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.3096200378794593, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004477481811131 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.3207540410972429, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003903119960695521 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5476435195138217, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeL_precision_stderr": 0.004678577775970766 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4306118288590538, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004703335999269713 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.44712303244722945, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037795088969010803 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.586053500249237, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004722958800399445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.456069027426347, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00467456255925281 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.47620688306939846, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0036948499162037154 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 14.665585017062076, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.24236429155044314 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ed929fae9fc62bf987213c2a737637dbc252e379 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.06843022196095826, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016209829348703792 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.3693465742479335, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00572904770734739 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.10846874891657471, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022660858526442344 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.021934695212110132, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008587513806860508 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.12551180235367884, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003857834886276161 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.03520534486773339, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012605430557977252 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.0563802824192856, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012612650769465242 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.3137283634535684, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0047883163685496725 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.08984155494834113, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017851961263540854 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.0599262486131966, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014112607677852654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.3288265886334672, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.005158374848086975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.09510390510913437, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00197951518364349 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 0.5771668227229001, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03949018213229336 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..77b442b36ad7ec4d8e3f6972aa29fb6cf2474c2b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.44750599283357245, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0061750153120256656 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + 
"rouge1_recall": 0.5030371140620634, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005057579814946662 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.40672974088724956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004747719503261412 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.23121201106352332, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004554395505359171 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.26095223293350595, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004203049627414673 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.20794763136088765, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003717449517818062 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.36379932365992146, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005316173565881487 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4154788393144536, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004521009665809461 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.32968419760104106, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004005395669312728 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.3878588296056201, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005564041350161236 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4364619953127603, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004639975004724414 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.350222658819573, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00419318437885468 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 4.943613794712205, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11844944764051875 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_2.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..915a8407aa19533ba0c50cee554e10847c87dfbc --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.5520188411993677, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00601214288474237 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.548220949758813, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004819342758931737 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.49290093244985994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004535226637927198 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.3120875667889668, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004983873820968777 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.3067736586377568, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004384823596918038 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.2745871334738363, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004011289790613376 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.44944454565402747, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005387909108481333 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4525511017399319, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0045363314626869445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.4014212538061013, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004110491966627668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.4811824027219158, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005594588861653996 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.47956221956314604, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045847302918650785 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.4284958019884404, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004213919043895169 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 8.417964438312604, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "bleu_stderr": 0.2643792221022832 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ec07fa9b2c30f9d66cb7458415ee0bcc754224a1 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.5928085610957852, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005517450633254952 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.5525764549057455, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004701547158523795 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.5252035299000158, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0042492415448978004 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.3371412147682034, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00476635918343834 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.31720395729691514, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004415324262802716 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.29693925227774526, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003968509948406998 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.474222482083209, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005000455875766691 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4498982913693952, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004517684638474564 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.4214251230467327, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003946378140362295 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5159206623402781, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 
0.005204009404461667 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4820510087427771, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004506991367679515 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.45579454499026173, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0039966392224786975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 11.337441125938529, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.35281351837475067 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..27ca8266c5de79e4915543938bd9d0224e61d927 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.6179161705510887, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005179925027460375 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.5499793848410689, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004719898662325513 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.5392200875202074, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004029829765978907 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.35489114840169056, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004672719377311766 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.31710389747209167, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0043486372666120005 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.30727811276404843, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0038965857969858637 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.4928834334072748, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004801941083552381 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "very-explicit-description", + "rougeL_recall": 0.44439662825960463, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004493778056439419 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.4305982476427426, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038132986252883195 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5355960987340154, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004945683254307556 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.47715356656187047, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004460972059827477 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.46597541865449993, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038235033593005787 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 13.517168068549186, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.46447484153448027 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0e5d1dfe25c6d78a0397d69c9191316dac6f0362 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.6306450054131605, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00501971293696801 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.5530712542241596, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004764483537044356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.547936351735332, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0039340919320462716 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.3660341167515341, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004644221137903947 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 
0.32179748545535564, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004381948513132989 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.3143304096356874, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003788714030250737 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.5058246261062164, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004683812001690324 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4489733936754188, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004527883655130444 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.4395476708412179, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037047300276620886 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5476387430812724, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0048323373547494745 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4809209755093611, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004512527660709306 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.4743263285062845, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037242713323822315 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 14.620474736449363, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.7629902022298345 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c0256650f1d745dc8e4dd17910a4ca07e96a8746 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.10652121167086522, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0023688690576375666 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.18229124220903334, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", 
+ "subset": null, + "rouge1_recall_stderr": 0.0036055900199667965 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.12325351760686601, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024378361146446775 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.02403122038179163, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007699060973631696 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.043625038488324976, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001479846592084219 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.028234093188988292, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008538683596415171 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.07502743287846425, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016790640765481034 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.13332814861106884, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0027047012209831054 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.08741178424542367, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016840514722071915 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.09866958650856938, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002213777292688482 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.16898386767188542, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003353535963349856 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.11405814566073934, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022600674454096924 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 1.8920645866194166, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10906475215438381 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_1.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..90a3891d83064510c31b2f3932a64875d832ec04 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.20625362062713593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002872702399736292 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.2783423182358244, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003090804568475943 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.2065151225097691, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00213658012540372 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.05273686788331137, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015443615818967447 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06918059640276301, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00164036782101426 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.05042504357023753, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010841844819624794 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.1506871830070282, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002247453578198224 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.20472914353418092, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023795235115801183 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.1494723652733573, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015152057849824208 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.19197325780481556, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002708881855569751 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2588957282283766, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0028837734118927188 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.19187799581291262, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019884841459792014 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.6453615405491733, + "dataset_path": "GEM/wiki_lingua", 
+ "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08715117333663348 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ebf1834fca65e7b52ca7b4e359740cf5f6f648e8 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.2604985230342208, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003532579292360877 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.27042057106417866, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029893664849675794 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.2215153917570656, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021128026916489443 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.07505187378193366, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002143898427448883 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.071229761133421, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016357845818527237 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.05844382108884806, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012190765682804957 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.19802942305185472, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0029629197765882512 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.20201919148745845, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002320204372933796 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.1647093031611208, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015810334646662381 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.24329711860986517, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_precision_stderr": 0.0033460280224748254 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.25211394848515684, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027986441657024356 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.20635319833287152, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001971460412273462 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 3.443834214575318, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07662644696644708 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b9feede3cca34b85c28cb98762a269d600386c8d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.24793362733863641, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004072705116113129 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.21602357804646766, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003247971160753683 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.18909911450688263, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002485261851347391 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.0757189416385469, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002438351512209196 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.05889179745482193, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015809058259419952 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.052264080690175074, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001305053336431743 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.19350920887510484, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0034516414937821207 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.16335220863904687, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002527136592321546 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.14342019954697277, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019335669853057015 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.23336686434028253, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0039000769946596603 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.20188401739948075, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003045495397045043 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.17698278189160158, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023368019527441448 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.9320240355527942, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06400188830946264 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..97fb56b22c346b9185bc91c9a5683e232c2084dc --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.08505455985583703, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0032686912222570355 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.06784754434173991, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002609991683396609 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.06018981225571402, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021308767674926973 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.02558603416085643, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0016206620665113888 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"article_summary_en", + "rouge2_recall": 0.018640033822395298, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010708114501748317 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.016389358943220907, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008517841588473893 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.06804946746366528, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0027433941383577104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.0519878061323653, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002012096563564017 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.046457274721451496, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001657068676571334 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.07991322250629485, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003104723745220196 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.0627062428611556, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024026682988274907 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.056048626197687586, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001990429951365515 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 0.10422757463369249, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.016913708236514044 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c953bc572d7a7681649bfaec2ccb8aadb4a27174 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.014899886343061936, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001636426713859285 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 
0.010011528241871159, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001061768890010487 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.009433687410752581, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000950731356497763 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.004817270390027373, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008233657770553777 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.002766503050071671, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003904045601421101 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.002816474216950762, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004045724170735749 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.012510249874248657, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014618089207318412 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.007744934006324679, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0008222562130519892 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.007435953857664836, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007624386481813805 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.014274933245445069, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015867990609173925 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.009433722001459076, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001003734017369036 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.008924051993279348, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009024291145168211 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.1661355655826006e-13, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.450117888401184e-12 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7137c4fbb8cfad5b7a1f418f5b690855389f870a --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.07027100884274905, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014521445356974571 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.10472445052112465, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0020798618023743585 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.07708183394727501, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001463347467260908 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.008562832254231845, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004136142831151558 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.013863303125330673, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000729102085052192 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.009678964571703324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00045902080520941383 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.06026932981633865, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011386395293241382 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.09167546245478224, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0017633863007209104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.06658903938989344, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011726928962648847 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.0657271406620227, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013418331595434927 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.09851222709720396, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001953574801462124 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.07222873567668915, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001359766582637702 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.505726128252773, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0470776753818845 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e07b3713a2f9a03b82f73fbf3fe3abf432c377ff --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.11632215837726943, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022719726698542486 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.1089073980683278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0018643903712033862 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.09649976388611473, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014734065656729948 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.011877583800252254, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009921470208561732 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.009499474155946874, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00063547866730146 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.00821435177714014, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004985188271972035 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.09352683015219185, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0019241784332484945 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.08657411410201665, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001449213254847429 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.07633654428148366, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011106535993015147 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.11156528958872643, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002174780318855094 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "rephrase_en", + "rougeLsum_recall": 0.10428854893025001, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0017563264576441672 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.09246968921921218, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013963398060902452 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.7067381410576549, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.050068101131539915 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3f87011093dacc92e52de7dbf7e55be490018bc4 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.16600399585233996, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003955264411797659 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.13361835608132602, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002823518494999984 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.11656166289615656, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022320874454571445 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.04755537268642802, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0023999815353559694 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.03077587271042821, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011932156140592707 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.027031284270892082, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009748921310219831 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.1375576289609542, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0034657367963040514 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.10787166436393111, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.002274225028197049 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.09362676385446607, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017594678125178764 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.15740400069202692, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0038134600407906997 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.1251306649415424, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026365736489900736 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.10952081998972399, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002101849995710418 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.556112040416221, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08795854619057293 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..45a63a9a1ac990299734962992d2ba8f7f531416 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.19744399731543294, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00445562488490846 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.13957366973269725, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029942900998473883 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.12759653021086045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002413682977750019 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.05894645060462109, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0025387661262517916 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.03675690679659171, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013469211911312032 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 
0.03325713135121395, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001088176891614555 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.16313034967971393, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0038938090385992356 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.11198595815037796, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0024095154984473616 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.10204884478099904, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019050000519439114 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.18602083735680533, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004280440688070132 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.12966848343967344, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027716800385877054 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.1189102904059155, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002250535740232956 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.7264483443297305, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05497724685084595 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6d02f0a9984308ca3dc159c73d901fe74415e806 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.0759239139617649, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0033259656106442337 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.04900609435825128, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00217479262759098 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.045804729083272995, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018398695267236884 + 
}, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.02337045099123398, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018175381983276736 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.01330250450032572, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0009108445730813351 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.01233554242069137, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007539785982029479 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.06433885009869143, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002943755350067257 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.039872298371084615, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001773782906076355 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.03730234130304816, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001486354806413072 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.07182753236028966, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0031894464950767594 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.04564084123760375, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00201563203869237 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.04272052087689551, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001706024011899182 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.03631236847231814, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.005558768912443654 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8a8adab74b912f7823549c1695ae3189b852759e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.012958829476217265, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015012434227656753 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.008235611123812514, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.000978389330203237 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.00781487832891677, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008441904204964693 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.004843771947683436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009481338099913783 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.00236491801694592, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00045623828249607127 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.002188555672921467, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003407015183673761 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.010904268951568518, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013380932808541883 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.006477885900035703, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00078133822629954 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.006226776578720049, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006793362081572629 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.012233303392934904, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014413364413859053 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.007708053571943523, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0009228587153200308 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.007267764694797319, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007815243584570973 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 4.269762783929057e-16, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 3.518338645971512e-14 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..92c05a45587bcc610aa13bff3020e2371a9a64e8 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.07075481728468239, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001896345738729238 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.06643817307325445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0017106553038965962 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.06017877876012817, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0013969700713652288 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.008567992055791158, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004658658723066838 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.008438860890352569, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005506053125992731 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.0073466534048262444, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003938081639864923 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.0616207631889302, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016585682800579703 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.05781173371355003, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0014446828709987923 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.052169980061393784, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00115977640796648 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.06858606377672731, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018515695822560211 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.06374381567280066, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0016154876653642737 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.058015717752569224, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_fmeasure_stderr": 0.0013371683684572618 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.30166616125136464, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03299092473107825 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e9a3897c17d15ce9e4fcd103fbdb8f221d2a3453 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.1371808934348306, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002439319230759517 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.12109561626523216, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001793187416049064 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.10986107432965571, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014492605599785123 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.014358452712872411, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009985415125916485 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.010448529454438157, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006381428033098405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.009793539257262535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005566732491126781 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.1105937798064453, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0020150560406063285 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.09669600580268385, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0013796211767045082 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.0873337413737127, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010847480607585381 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.13197826678166255, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0023452181446837026 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.11668487016999766, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0017061887616042837 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.10574341756058646, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013795771170144668 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.7295859274519257, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05390866707359243 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b33cebe1a0f717b6fda32aaadab669950602971c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.22790730662467748, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0038619293949543852 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.17039417678072985, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002623682838317647 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.15896419199407144, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002110741679844575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.05835411226724025, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0023625109773989625 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.03655654766207606, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001311888413153822 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.03477199950268131, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001148170515656098 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", 
+ "rougeL_precision": 0.18500655628018325, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003313643303697083 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.13521687887956313, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020736922184739505 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.12588142669948893, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001640871513930331 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.2169696601785429, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0037256328539598173 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.16119268884778876, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00246649624417531 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.15048882231569044, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019906565024388205 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.1889717960368373, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06741726194279306 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..34f90a49e008c8dcac1fc97eaa50778b793b28b9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.22026933435692986, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004226027941921998 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.15168635082635815, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028519053998314037 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.1445324809151167, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023513187047355505 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.06276445362278234, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0024682215594049096 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.037991873791024235, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001399765447448907 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.03606843530067028, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001164984602255506 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.17924178959744783, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0035990136677962322 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.12047726091128781, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002275634505946444 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.1147650004577283, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018510908050423905 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.20873376387414452, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004046132323079285 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.14298638872506758, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026905116077442096 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.13635671547440492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022228611437271725 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 1.8679229391280179, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10987316164693478 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c6addfd6b1837551da1133b6a004d765352bb6c3 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.07393725587462402, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, 
+ "rouge1_precision_stderr": 0.0032181295487344647 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.046454406894951346, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002038618034193849 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.044918688464249894, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017947926321333124 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.020800186309775436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015975541170520002 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.011520777401595593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008755975458470784 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.011254692924376551, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007414468996376601 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.06182358864059859, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0028062754361036935 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.037078468066000264, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016222637598800273 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.03622514020157269, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014518847052011796 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.06969123514917439, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003062187931063394 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.043474473389648785, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001906693067415681 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.04201633885081397, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016772951164631064 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.023688527977254865, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0035743692989931774 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6c8bda2fb310c1d17cad7961164e676230d57ed9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.010706295526971374, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013089961975418759 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.006817958817895227, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0008411163469920986 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.006735634553796344, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0007809237197212106 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0026189086956420245, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005673572995320269 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.0016413611270950422, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003194112456571687 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.001543687945490394, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00028832955644989586 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.008831988452309125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001131402041475174 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.005517158220882632, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0006948992894017861 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.005386364695009258, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006350514269837313 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.010075169657915205, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012404555583104564 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.006456627301202316, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0008077762212667873 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.0063447932832890455, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": 
null, + "rougeLsum_fmeasure_stderr": 0.0007407891319077604 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 1.2748239955674293e-18, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.9738057634526036e-17 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3268f0a5ac4537c3966c1cf23c98c06053d4dfb2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.13048152009975786, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0029945327745025297 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.17865577403787464, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033881835064667153 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.13282629414914784, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0025194987202168846 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.03018125254771904, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009701597241428515 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.044665240253023694, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001413086106707342 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.033116246346807675, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010039942081565677 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.10254296237637063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0024976516791238405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1412543271300389, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026741023650489657 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.10296911367521405, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018747375341668269 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.12215023147917263, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0028728571931844558 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.1662465685102492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0031683211934688273 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.12355432651994205, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023550817743274908 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.744610721855112, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.16526552066121028 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bda712af9935fed5be9af37614650ffedbdf68d9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.24565174971014234, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0037427166308021094 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.21184490359364247, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027978242331130445 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.18959676949532062, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022208786444120037 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.064292697870659, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00223313142118904 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05044989353111514, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00149356139860852 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04555610212149096, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012842639383539272 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.19090785019208276, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0031096615547551243 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 
0.16233189417978816, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021666821757260154 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14459077408940324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016809419269475205 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.23112890188861118, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003571687360942092 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.19926382429471512, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026476455852546383 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1779721097148494, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002089075825231665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.8487850954748324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07399043126258031 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d308d69a77ba75828ed0feceb39689ed4d3096b2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.3177459530757474, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00400211201809437 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.23523949924863127, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027923501651691642 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2255393163759458, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002203135213246267 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.09785647917944716, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002624157152992091 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06533504155625053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00156036864094978 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"tldr_en", + "rouge2_fmeasure": 0.06387789794595161, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013813734339407 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.24936441373065246, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0034018286291394332 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.18082782092908864, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002197669620317046 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.17360298467727694, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017280099368334454 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.30084337998110033, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0038595850362637936 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.22147712923465016, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026291853666447025 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.21263985026820842, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020905527432427407 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.7186948773008828, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05546113136172655 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8bdae13bf80979b9a028b8a72c44676f4f76cb82 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2846746242566653, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004319925738331281 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.1938876931146627, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0030463601923653946 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.19252198096891418, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0025594189650328643 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.086213919205008, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0025532621303566427 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05487592547354139, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015509508090563054 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.054477366161976716, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013706432857531726 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.22546828575157715, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0036375567676432637 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.15015796002510307, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00240340751959009 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1494133193527796, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0020035425127221443 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2694034648043568, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004158098981958554 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.1821584435675664, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0028693565561164385 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18112869368220622, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002417009296022553 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.687306446962641, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0664578324203411 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1c7d30d7563da28c3f1d2fe88a50d4edd2d96502 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.0960156493415235, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 
0.003616614563173548 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.06015169027512941, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0023503642149532013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06084731695924829, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002176933000207993 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.02926487891196175, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018445908136531816 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.016390097756710017, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010106857547182186 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.01686088972765567, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009404003204068355 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07801821469361046, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00303848388489472 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.04729130821614451, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018488779124782556 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.048152202763597045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017262084858342547 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.09073940139372572, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0034537583670396933 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.05607109134810165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0021842218596914128 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05696906006963692, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020412224032817106 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.04668959419048288, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.007153039873759097 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_5.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dc25f5f2a0968f181d6aadf3b5d2c1a8060482e8 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.016572326495735878, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017108845418978154 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.009927306068576757, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0010319824087958868 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.01017010786697453, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009828813757946661 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.005362446572675429, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008727663926039747 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.002846016962133478, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0004240233213271509 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0029391304431817997, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004163042405148408 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.014042125577094597, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015259273674767392 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.008096887154439805, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.000857985946484465 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.008341512107444696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.000825303231689821 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.015986982643131028, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016701967608221248 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.009428039452194052, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0009838146973353556 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.009670630558683124, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009355088060122658 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.2603158162943564e-14, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 6.751733488101547e-13 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3af27affe2d926fac81c8f2a7c1c26cf1d006cb2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.08141174878886664, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001439622775683003 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.11192550497068104, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0019100391613226038 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.08553702561971455, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001355291261214151 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.007141249340697971, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003903230357924652 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.011016516206080998, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0007193905687669391 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.007670667921800801, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004129088476854741 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.07117531662005762, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011655127025012122 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.09968282512781339, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016088854064618629 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.07528888534904114, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010842064028811886 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.0766197709818064, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013526915170476849 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + 
"rougeLsum_recall": 0.10547363912496262, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0017901245298913224 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.08045200906728588, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001263370693113323 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.45919147551421197, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04515627730002599 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ab67041f6032df87674146d6d18d3eb85d67a1c0 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.11566837691197226, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015353140127384068 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.11410379757796434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0015344257802797116 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.10159577693048065, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0011952248894988619 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.005802720511302557, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00036268341953713836 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.005939805394665047, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0004255365897056713 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.005177050428898058, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003248271761339092 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09274207592681537, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011966610399347596 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.09159973215721444, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0012088819142730058 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.08095570189299443, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008902535417571856 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.11168791780519562, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014653968202923253 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.1104796475888384, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0014798128235735315 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.09821383093532979, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011443600495618714 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.43632620722433213, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03652700409738095 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6cead20b593e1a68520b8c5a75422c10f8f46544 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.14601422897116828, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0023528068041928623 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.17757421521329586, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0025858960704315853 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.13853002039888562, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018231710614927264 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.02335655461073883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001024065445599411 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.028259252819261033, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge2_recall_stderr": 0.001110223234196 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.021415156800516547, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007744432940768205 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.11201526766555471, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018577920473740726 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.13622889891032916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019324235004749345 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.10498997106963696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012843653383884555 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.13843833207278688, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022359405028341195 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.1677188214159601, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024133512054743036 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.13100826999001244, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017055637389061106 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.390261470828549, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06598656718700266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..17986aa7b19e62f2839fd5bcc0d7dae69b5d24c8 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.13285396611786718, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0030209658760757827 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.1422872983892613, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002878644422451723 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.11404753732090826, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021481783154813357 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.02798134561881831, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001386829306669423 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.028032889930510667, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011691508536939195 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.02209148162686031, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008417227606383286 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.10397480499710599, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002432028827552923 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.11068982956126656, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022301779950906378 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.08774826412280444, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015918396074012017 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.1253632873485129, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002862010683878808 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.1339058808413107, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002703503542054378 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.10726187500665701, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002007303392628382 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.465485886708219, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09344145038532585 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..936fc2ab94039ddb91a95644aef70abd8c4ebf38 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.039486863977905025, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0021598107276160857 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.03691619467473134, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001837241505842736 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.03077167379460974, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014760361121106862 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.010893541084177573, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010824871341449053 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.008219229554088426, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006725668226973433 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.007149055705615344, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005554564096594137 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.03222582701764484, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001833931334057099 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.029219757580315777, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0014381541985788997 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.02433445892307545, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001153157619824516 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.03741092237605496, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020645150808717167 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.034636931128603624, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0017203814565781976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.028933058276345378, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013845885404859834 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.022392125697331136, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0036463991883077577 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..60173bd1aba8bb4b313aaf2b8e0655f552a22582 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.005198934009239172, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0008983856883541296 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.004343154687994872, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0006911835975220482 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.003726273969867652, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0005744387153012822 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.001571113975338106, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005052873938474259 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.0009049370506867134, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00021201763153043962 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0008390948687705988, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00019938162808952095 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.004366896062808353, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008005747263531927 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.003439980473577673, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.000538456144374977 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.0030007686487377285, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0004713098658540184 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.0049827592266209475, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008729306567735783 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"write_abstract_en", + "rougeLsum_recall": 0.004153089344111274, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0006636613143429368 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.0035454836584688973, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0005472430761164219 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.7915004290523795e-24, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 9.569603854109237e-21 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..08944c296afb77c31a80f057b559a22bc41881e6 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732954 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.369, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015266698139154615 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ee25d8ea1ce4586bec15c894c9a0ad8998a21cde --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014976758771620337 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014998131348402707 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b3c41838986c766163d217e8787c1469bb55194e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.356, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015149042659306623 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.361, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015195720118175111 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7930b0ab6ee948c236197f149b79220aa4fa5334 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015039986742055235 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.36, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015186527932040115 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..15e7f4942c3b01a9b0b1201118133419fc778fc2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 
0.01505026612756443 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.37, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015275252316519359 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..826ed22888fcdff4056a2730d560b17c1350115b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.35, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015090650341444236 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.366, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015240612726405747 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f2211c49e671e390e34d22312de9e4a0433cfc88 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732954 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.309, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014619600977206494 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..bcc18115bd79fdb3c43a385630816f3dcc02d663 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b5182a6d65875e8491f9d84a7b49f2daea95d8ed --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.362, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.0152048409129195 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.347, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01506047203170662 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..422170fb3665105142770670fd9f0b313ef94926 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.348, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015070604603768408 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014955087918653596 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..98a1d732b14aefd6ef31b4458fa081f94858902d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014998131348402706 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01497675877162034 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0d4259981395cc4aae78d59c5c8e8ab3690cdcef --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229868 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014944140233795028 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5b6a70003c42249bcf56c922560b8422a9050124 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.366, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01524061272640575 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811476 + } + ], + "config": 
{ + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..410f05511f1b270b09c99bdb5001d0d792eaa6ba --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..802f8b626d174428ace3b81b444f41c823531503 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014998131348402704 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014944140233795023 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dedb5657c2866f54ab364b8e602541fc22d3b2a9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.349, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 1, + "acc_stderr": 0.015080663991563102 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015050266127564438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..555a036e8ef7191369348ae59bee8d81da980c11 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.347, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01506047203170662 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014782913600996673 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..217e2d9cb28c6936712eb3ff952e3e8d0086efe7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014818724459095522 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014818724459095524 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_0.json new file mode 
100644 index 0000000000000000000000000000000000000000..78d0d05f3651e37e5bcc2057bc561674515a7313 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229859 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014794927843348635 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d3ee3949c9713eb3b803924ac74d126e84cfbc69 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014899597242811483 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811483 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3bcd5827e47eb0bdac73c85f9a3a1534320d405d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015050266127564446 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01496596071022448 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3a923915b9f8da6bcbfe8fac479451ba3be1b596 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014842213153411237 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014955087918653596 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..800601648c38d54b7a1e0d9d8327a696ed40d562 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014922019523732963 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0fb59145f0c6b97fb33108793a3b58a7da9c3d8b --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014782913600996673 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014782913600996674 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7415cee9cdd2ce9d3c8e19cefb13945578e68483 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.357, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015158521721486769 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..424e6caad1262dc85762e725b8f9a2e2e794bbc4 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, 
+ "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3d17828cd0e8d1509d16a4896459a6d2516710bd --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014944140233795021 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014888272588203931 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d4ef84cbc47e2b31f6a8cfc1f08911ff8afb7a0c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015008706182121728 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.344, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015029633724408947 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..163f7dfe19319380d706b44f24e101381952a510 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014899597242811478 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 
0.014842213153411244 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..79b8557620a274be23493238146d9e07c8239c76 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r1_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01480686473373886 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.0149550879186536 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..02c6f52492acd5fdb350a9d594ee408785cd0c11 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01494414023379502 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.354, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015129868238451772 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..586d7b40611b3dbde8c9a7f61308fe2b109ce500 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + 
"prompt_name": "GPT-3 style", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01493311749093258 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01487687202745673 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2d04c67038c3e66e784030593e7ae9b50cd886e7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01499813134840272 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014976758771620335 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d846b7eadcf484e0aac5dcd4638ad36f732473f8 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014865395385928376 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014806864733738863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_4.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..61d624e1d5af8b2d2666db822e281532c44d1b99 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.344, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01502963372440895 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.353, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015120172605483706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..64d5e2f85465d126d74133b252fb9c030e26d801 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01497675877162034 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015050266127564427 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..26fe571ebda53378fd0c118177aeb1cfe0767299 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014922019523732958 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014955087918653598 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..77ef8fa01619308e65d623ecc3f204870fe2ffea --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a6ae0849d0d1c69751daea4493a06ebb574ff20f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014734079309311901 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792508 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..85a294591a0a1d7975997e2931a4405cf590480f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.328, + 
"dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014853842487270334 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014818724459095526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6f140f4bd1a54ddd0f80fe7f9a7cb24e27846ec9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.304, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014553205687950425 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014794927843348635 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d32d426b94c323bc597784a8673dd404b24ffbc5 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.308, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014606483127342763 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01470919305605712 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_0.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..018da09b594506fad3bd99e57d6a24da54d5fb43 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014830507204541045 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014922019523732965 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e7611b7224fd4402c95a4692027821ac97b6dc4e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bfc75b09bb9763ef604953e71d6e70ffdac152b9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01483050720454103 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01487687202745673 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a4b2a2f01d31811fcb780b90c9010cabf9a175d4 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.015019206922356951 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015019206922356951 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f073965ee567e141e974b037da4eeae1cabe1ad7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014709193056057128 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015039986742055237 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b4e84ccea9accd71c8ed2947a2f7b2efe141ff7a --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + 
"acc_stderr": 0.014842213153411245 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014955087918653602 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..531604458eca4cd9a8a88ca586105d94537bb592 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014842213153411247 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.303, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014539683710535267 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2fc2c73bb506cf594d3dc2cdb90fb7b2d45d9b6a --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cbcfbe96238b29cec2b271e58fd4d8e8a292eced --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014794927843348633 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014782913600996674 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..59e01a8b0e3903912129204532a03ab5b9a0817b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014770821817934645 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014782913600996673 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..46fedb8c801b34e60194fa9bb51ea9c57a4cad83 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014806864733738857 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + 
"acc_norm_stderr": 0.014782913600996666 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e58bb11abc7490fa089c32dfb3f0f92e0cdb10e3 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014830507204541042 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01477082181793464 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..34fd194bc5fc26adc2eb908fb385248f16a60b0f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01493311749093257 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..77e60a4e58694e103b53441c03f5c0d6b29c8d89 --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..587546b905ccff5d9966eb547e4a0fc3dfaff838 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014806864733738863 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014830507204541028 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..341f6fb637b9f66d44dee98370933f4be8832308 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.015019206922356953 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014976758771620342 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0a36b5d40a3c81bbd9230a16c3085c1d5ba6030d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014899597242811483 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014998131348402709 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..791264b401887dc303c6c3b030ddc504d6b2fdcb --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r2_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014842213153411245 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014998131348402707 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..612162807e7ea15fa822203cd1377ffd99eaa6a5 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013664144006618268 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.33166666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013596836729485166 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2613c2d4e2bba5551569efbd780af16011bed9b3 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3475, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013751753243291852 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013728421539454885 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2b720126507fa5faee796bd605da6f62b436f284 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013605417345710526 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.33916666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013672343491681815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..525f5b0798c999aac169eb3828431939d519a63a --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.35083333333333333, + "dataset_path": 
"anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013782212417178199 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013696658778002515 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..813ecfa5f7ba0be9aa81c71df23baa7edbf79db4 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.34833333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013759437498874072 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406394 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..edd20d27fd44e15ec14f27172b4dda3db10e88fb --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013630871843821476 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013570806258433623 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_0.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..de749d7d86c2e8e66743dae98d3990a2a416954c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013630871843821476 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3175, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013443538681348052 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..951a99274d0cc2208f380763b55f63d20dd1d470 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b3f2b8878df6f7bfd277d524f07a39d0e18c0c48 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013526454480351021 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013553211167251951 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..aa20d6d350f545cd2eac3cb96a0efcf7f7f65066 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013462309712005134 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013499258621103247 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b9ca1e3fb0f9ee0152a896f814c1007d2f53c5a3 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251946 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ec6a54abe90a99436089d2b150002b4c68d23709 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + 
"acc": 0.31333333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013395739415639082 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01349009528298952 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..36284bbd4a59f873a300636f6e04ed305c693992 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3425, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013704669762934732 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3408333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013688600793296934 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..366f1daa13aeca06730be45999210d4549855601 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_2.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fe47e9f9c3382414bb263d998c6c16effa34728f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.30833333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013336721143136469 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3075, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013326707242912048 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f5ecf8566b081b79717abfb0a8d65031863117e5 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.31833333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013452948996996289 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.31666666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013434078660827378 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b5f2d52419bddad3be6ee981c36e209120c3a9e3 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013490095282989521 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01347162092976915 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..00659099d4f48046ea4e3619c30c0dd4b0b6531f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013544340907003665 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013553211167251951 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c306293d41ecee52b44b75a61b78473c80f2419c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.30833333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013336721143136467 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013462309712005127 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e2392edd296970bbb25ccb34fe1f800a7fade7ce --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_1.json 
@@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013639261190932887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..95f73886582aa9f036f9a093787ffe5bd4ff1a33 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013517438120881636 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013562032919529019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..46ad951c85e04f9bd8b2373ccd8a64ffd1ea68d7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013508372867300215 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01352645448035102 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6dda21ce32676d46515398c30b0cb693186be7af --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013526454480351011 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013508372867300217 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..682c500ef610a2453990ecad7d461b4f80321e31 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013639261190932886 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013639261190932886 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..37b9c07e0faa8369fb56a170651d0dd9c9ea2bc6 --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3475, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013751753243291854 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33916666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013672343491681817 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cc33f30876875ec081133e8b0a5aa679737c1daf --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463653 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cf8e047f40de98fa47181fd4291b6656dd9199d0 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.30916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013346684134591945 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.30833333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01333672114313647 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 
2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..21b062f85689d56d9c5d860273ce9bd4903168fc --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3175, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013443538681348054 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013508372867300215 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..276de6f049e776504efc5f7b86ad31c57c205d99 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.32083333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013480882752851553 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881636 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..54ed6c2a449023245c3bf8e7c1e63f4162b64389 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_anli_r3_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3308333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013588208070708997 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.32666666666666666, + 
"dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003665 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..53a6f8b687aa398d39f4bb0df4cfaf86e47c0d3f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012288926760890793 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012288926760890793 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..09a08a1cef628dfffb6e0a6b57bf191c57ad8523 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25170648464163825, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012682496334042958 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25170648464163825, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012682496334042958 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_2.json new 
file mode 100644 index 0000000000000000000000000000000000000000..5b6d5b9f1199c93d6110f45da5b54fb301dee287 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24829351535836178, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012624912868089755 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24829351535836178, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012624912868089755 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9b81f39fc7f11608a2c598fadd9291fd3d8ed69e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23293515358361774, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012352507042617396 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23293515358361774, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012352507042617396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0bb2145b5c03198ae5aee8454d636592545312eb --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24146757679180889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01250656483973943 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24146757679180889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01250656483973943 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..34e8d42c42cebe10197158dcf9504134fab68aef --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23976109215017063, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012476304127453961 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23976109215017063, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012476304127453961 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ba196e3dd4d0ee852be6465e8a23958f4f55febe --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2636518771331058, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012875929151297058 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.30716723549488056, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013481034054980945 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6358a9ead160cefec17c0f67e56c14d4bd362a51 --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26535836177474403, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012902554762313962 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.30631399317406144, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013470584417276511 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..43c2bde5f16eaad4909a35c5e82442d2311f648f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26621160409556316, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01291577478152322 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.302901023890785, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013428241573185349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..14bf614b3feb71e411abeeb9cde02224b97e069c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26791808873720135, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012942030195136423 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2977815699658703, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013363080107244489 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d7eb3032787432af70911e49e072d72067284cc3 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2645051194539249, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012889272949313368 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2986348122866894, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013374078615068754 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e3eaeb110f49c3cf83ae53841ddaeb6b6b4b2175 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2525597269624573, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012696728980207706 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2935153583617747, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013307250444941129 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5876de279b831d0adac2c2c124e899a251e8719a --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2380546075085324, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012445770028026206 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2764505119453925, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013069662474252427 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f616ffc1bc23827441d28ff6ddb4c005bb24e5f8 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.23037542662116042, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01230492841874761 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.24914675767918087, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012639407111926439 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4803d09e9f513ca9037af5c100d276f6b64be0c2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2440273037542662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012551447627856257 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.24744027303754265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012610352663292673 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fc391702073b7a963a81f1efcb3cb61aec5108a9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012414960524301839 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2627986348122867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012862523175351335 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5c3c2b16061ce5ac302c426e583707a21a0efe52 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24744027303754265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012610352663292673 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2713310580204778, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012993807727545792 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c717fd669ad72b1eb4cc8db7c16dc827950051db --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.23122866894197952, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01232085883477228 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2508532423208191, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01266819862131543 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..97bcac61dbf156ac5a626ece3922cdd708199e14 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012272853582540799 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012272853582540799 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f520dfd9e9d01732f8bdb2c6699c0dc4318ac3a7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012536554144587087 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012536554144587087 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..de6e3e233c553bead9e047ae2312b16a2649f623 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012536554144587089 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2431740614334471, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012536554144587089 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0be3466484d6eee5eebca3c5e46335d5854f55f5 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012414960524301842 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012414960524301842 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 
index 0000000000000000000000000000000000000000..f2bb7874640f492ea5019281b5353ca839719b9f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012653835621466646 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012653835621466646 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4420341c96c1cd3ded9c78a64f8d4d09a69fadcd --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24744027303754265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01261035266329267 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24744027303754265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01261035266329267 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4c5996168177f6ef0c49695c621c4993fe0dd3da --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2551194539249147, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012739038695202105 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.31569965870307165, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013582571095815293 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..52faf5c9f34aa49d734925328a9e34d173e7c6be --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2627986348122867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012862523175351333 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30119453924914674, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013406741767847624 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b6caf443527017202ad98af787ec62b49e1da07e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2619453924914676, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012849054826858114 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2909556313993174, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013273077865907581 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..409a882382f866c7e5116da649f53780d271633e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": 
[ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.257679180887372, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01278077056276841 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2815699658703072, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013143376735009014 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..aea792c7c483b38367dc6e0b04d372a2bfe688cc --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2636518771331058, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012875929151297058 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2883959044368601, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01323839442242816 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ef785a6ab3f340563d12a2574d1816d96144bad5 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_challenge_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.24914675767918087, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012639407111926433 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.28924914675767915, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013250012579393443 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8545782ee53ba59fa100726e0a2f7082e3e5064d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2537878787878788, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008929657065808292 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2537878787878788, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008929657065808292 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..02dbd237210e9b7d48db2205073770938220f48f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.234006734006734, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008687500578023184 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.234006734006734, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008687500578023184 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7393646ad572b74e71d4f72df8aa1ba7a80f270d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25252525252525254, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008914948991495704 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25252525252525254, + "dataset_path": "ai2_arc", 
+ "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008914948991495704 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1da9d25bfc2a4b296b83980862c0a1fd3d9ade0c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.257996632996633, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008977970005203404 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.257996632996633, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008977970005203404 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d208b403d49e05c8a50af162c2ae7cc4a859e1c9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25547138047138046, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008949113551665567 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25547138047138046, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008949113551665567 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..7a6b5521dff6c41ca5ade4e40f64d4a8ed4f5d22 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25673400673400676, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008963590834042409 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25673400673400676, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008963590834042409 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..329ffc831c991c33ee2fb9a366026b7473e4c2df --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.36153198653198654, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00985850654316206 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.31986531986531985, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009570821820573587 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a141c42e6d74ff11946aaedd480c1b5b191411ce --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.32996632996632996, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00964831157424104 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3143939393939394, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009526702423162905 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9f89c6fd0ec653b101e8f9d638f130af44e56253 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.32954545454545453, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009645184190953856 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.30387205387205385, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009437524848293738 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b27f2c7d2030ae9df2f01cbce65c15c53ef4cb04 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3228114478114478, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00959395022036674 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3055555555555556, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009452181213593461 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c616bda493f3e5e7bcedc987765d1a1907933c74 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": 
[ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3135521885521885, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009519779157242258 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2988215488215488, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009392656275408728 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fe938aec82250fcc7e0a54a53351bdadff631a9c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.31397306397306396, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00952324533521551 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.30008417508417506, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00940400055851335 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..40132c9fb3c70f100ff2f550a3443e212c837251 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.28535353535353536, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009266280584997748 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26346801346801346, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00903915737449771 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2a9771780ab0dd5a902c2b665af05a8ac487cd99 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.289983164983165, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009310840970769035 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.29713804713804715, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009377397867796849 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4c88f19a9046354bef86282adc038d6d5c9c256f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.30513468013468015, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009448531094163912 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.30303030303030304, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009430140669278962 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a339c0f1d6ee86ff4360b6807e9bacd2bb2ff702 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.30008417508417506, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009404000558513356 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.29503367003367004, + "dataset_path": 
"ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009358110551087423 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f2d9748168c1bc2f6ad7e6548274824a87d4b632 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2904040404040404, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009314833302936285 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.28703703703703703, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009282621598983073 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..17bd7cd0bbb1a94c8d751d45ee3485e935452175 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2840909090909091, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009253921261885768 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2887205387205387, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009298805565435513 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..066b6f7d7a991b41d23ab8f86d29ae36ede3f01e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2521043771043771, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008910024163218191 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2521043771043771, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008910024163218191 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..102a4db79e86b8d202d0f097dc421ae5fa6229d4 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2361111111111111, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008714480491711288 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2361111111111111, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008714480491711288 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8d731f7d03ea28d59f109704924ba429d95f69d0 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25336700336700335, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00892476542452926 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25336700336700335, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00892476542452926 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..30421db2fd20f7ef3ba21204f4d5901c928ba44e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008885233166386385 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008885233166386385 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..acf29d14a355cacd662f870532987437fc4baf31 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25126262626262624, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008900141191221646 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25126262626262624, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008900141191221646 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..f198af31089c713f5a227e2d8127a746d9f97a6b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.257996632996633, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008977970005203402 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.257996632996633, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008977970005203402 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dd45bb1c53f253c54960652a35965e544eb41ba6 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3371212121212121, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009700146509130073 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.3181818181818182, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009557408782506372 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3b828146e1580d28217b28af0fd6a52758585fd7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.32323232323232326, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00959721864204534 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30976430976430974, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00948817285190372 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9be552ba2ea86305ad4a4d758efe5e5830cb075f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.31313131313131315, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00951630387930953 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30303030303030304, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009430140669278953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..670ecc03e26fe8fb737cd001c6183f5b8084f541 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3194444444444444, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009567482017268083 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30345117845117847, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009433837434252272 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..639fbc4662eda3513828e361d81fd43f9182e022 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", 
+ "acc": 0.31397306397306396, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00952324533521551 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2967171717171717, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009373559492986842 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c3dae9ef238fa1deff9319ec1f22a7beb72ee19f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_arc_easy_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.30765993265993263, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009470292575831185 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29713804713804715, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009377397867796849 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..104857215e51a79109dc9671d50a880ea51d0058 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.6006666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00894326942995515 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.627, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008830798057449147 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bbd4c2ef2daaac53bf08c110316d54c19de3c7ab --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5813333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009008626314760201 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6046666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008927944837940472 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3fbb1b38fce25b87d802138fe6e316400aed5bc0 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.571, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009037711366393888 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6023333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00893695992571691 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1d309db21d09e3dec6eae3406b4fcd781767a03a --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5713333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009036836097555085 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5993333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008948239303079452 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..55275111c9285f911b5ccc27c2078978dc105278 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5693333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00904202497793108 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5963333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008959169522662576 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7dc84bb99f0f52a5a99ce7dbe41d9dcca0d88a67 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_GPT-3-Style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.553, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009078792586293545 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.59, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008981103499757514 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dc1c654d067637ce0be30f81b096bf641425fbcc --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6236666666666667, + "dataset_path": 
"super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008846558976258922 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.564, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009055127374988174 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0924924b02cea5c20bd749ebb3493f41f33c090f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.541, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099483512819305 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099982269204863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..988f1bb2111ff892416300cd0f10bb600e20694f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5513333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.0090819853069321 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5443333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00909427038138736 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_3.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..67cfcce83efbc235de17d3b5ba3e431d6182a1b9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5733333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009031496556538196 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5623333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00905900327659221 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f46b709c61959ca0a1db5ea12a4a0adf58957959 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5783333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00901748678876912 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.573, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009032396953831096 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fa478d3abf234955363dd076fa2c82edc3dbf7b7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_after_reading_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5756666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009025076316539064 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.565, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009052751926300881 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..47913ebc3c5a64eb51af7b3e7671a8bed11aa64e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008846558976258922 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.6203333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008861873799148995 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..680f79ebd2b3c1db289aa4782644577667381028 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5433333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009095877403306732 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099982269204863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..76052cd77fcc3771d9386d117c8fb01ac99847e6 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5436666666666666, + "dataset_path": "super_glue", + 
"dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009095345834327867 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5353333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009107405418833935 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9acf6baad6da7a19bd3ab9635f17eeb0fc6fad38 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5386666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009102888762598252 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5356666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009106972161130876 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2e402e1f0ca53da4937f4e8350c6e7630580a2f3 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5473333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009089227499483247 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5346666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009108259669413834 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_5.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4ab443d5bb7a4f10b1a9203a0e131804db4cffe1 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_exercise_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.55, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009084465266030923 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5503333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009083851457629933 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..286b00e4b424cbb49d4b6a5311c6e1189579f2d4 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.622, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00885427200344005 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.53, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009113781890088806 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..79cde219ea03113963231f81312759d87a01a150 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5413333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009098980657278164 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.54, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009100967487199725 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c712c19773997d4415581e393d7b17d89dacd6c2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.538, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009103824830376474 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5343333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009108680663441197 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..181c3163d27f8f9ea8bfaf2218705b75a34a936b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.557, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009070708861664755 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.542, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009097962646004983 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d49441913a630383dbbbdd66d700381632e7b2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5636666666666666, + "dataset_path": 
"super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009055910870388479 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5516666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009081355012045532 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ed3f8773b131be64f0fa01da9010baff6ce0b674 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_valid_binary_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.561, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009062029213030572 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.538, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009103824830376474 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d4f37e574bc8aea93e1268f0c4902d48273881ec --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.6233333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00884811049411477 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008846558976258922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_1.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3157853965a0495d8fad7dae5f79e90606c49986 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099982269204863 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.541, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099483512819305 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6efdfd70c48f4aa1ef110d0395efc1f265f08524 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5673333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00904706345689798 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5836666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00900149831714761 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0f2daf21d3d7f59e2a8386649a77ba54688979a3 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5686666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009043721169619542 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5846666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008998379972670814 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..099a51f0cec6324b3735c51c53b18e99843c7844 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5523333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009080082050148014 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.578, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009018450207660424 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6fb99d084cd8396bd57982193814271e26547d8b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_boolq_yes_no_question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5456666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00909207019506541 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5643333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009054339719898379 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b972505aaef38dfc74f3c70d7e16015970111cb5 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 
0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.1940928270042194, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..055e1c7883d1ae3cca9058b0506f97a63323e8e8 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.2653673163418291, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eb16be53877d310ca3dabe6cd304bb28a4dcd2ce --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.29992630803242454, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e2316d06e20e5c3d5ebf92b1687e8ee30ea50dc7 --- /dev/null 
+++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.2861685214626391, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a6617a87a072e00eb4544f381dbfd7a9e10c2318 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.27619047619047615, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5a1fbd54b00059b106926f417089b4b201cbb1ff --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_GPT-3-style_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.26430976430976433, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fc4846ae80c7eb72a26c8a0e9e769d7b7f28644f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.1940928270042194, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c80fb07f60bc62aa4b7006835ca072b95acf9d88 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8295cdfcf2861161361dcd582c824ced26ddeee0 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0672477765493766 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.3349371825715049, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..02c6b454009619b61c7531464b5f777a2412eef2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2690727699530517, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..69a2f5be1d45dc4c1fd7750640b4e89c1d7fcf59 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.29072681704260656, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..686cc80fea46646d0e2e8e6266fd7fa2560eac64 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_MNLI-crowdsource_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 
0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.24773139745916514, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..641756c9c5db9684dbc15790b8df185a54d04cb2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.2857142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06091449038731725 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.20334059549745823, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..06a4decdd18c0bf43067861e83eb99425bf2ad41 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..25b5f5980bac1c3617a3d126d320abcf610a1310 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_2.json @@ -0,0 
+1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.33948717948717944, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b3fe3e83610918cf27cc0ee2cf1f16c6fc762e07 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.5178571428571429, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06737697508644647 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.35185185185185186, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1ad764b9783a07113c6f49b20f085289a7c7c1f2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.31111111111111106, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_5.json new file mode 
100644 index 0000000000000000000000000000000000000000..45bb2c0d198393926a2cd2bebd60dd957edd25ec --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_can-we-infer_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.48214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0673769750864465 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.29060665362035226, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..46db6a85c20924c8f123602b91d8fbe324ca779b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.48214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0673769750864465 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.2195121951219512, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2425be60b2516a94908342eb871379ba39f530a7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.34556087187666135, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..94ce6b49cfd275b4a9507e5de5559f9126ec756d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.29448329448329447, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e87d3d50c23003c5bfe25643b54ed26c26eaa3ba --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0672477765493766 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.2330246913580247, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..97fffe941df13d5f49aabe49edd14f1fbefde2a0 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1,33 @@ +{ + 
"results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.5178571428571429, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06737697508644647 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.25267737617135205, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bff52e97789fe4173826b7ced51a9f1b0668fbf6 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.5178571428571429, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06737697508644647 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.25267737617135205, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d8540e12ee2935f3376ed17662a9403846901ddf --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.2857142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06091449038731725 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2064336372847011, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of 
file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..86ba2e30e048fc27444aa5796b9e151ffb5ce429 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c6942e6e5a58bab38c96eb2936cd843ab7beed56 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3024109014675052, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dbb0e2640fc0923e860255f950cd007c8baee933 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.34175084175084175, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..645ceaf65e60b42c443a8814419842f643783daf --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3263888888888889, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..eb99cffa17d91623c38f5eb145240f41680a4054 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_cb_justified-in-saying_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.48214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0673769750864465 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2931392931392931, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6fbe5b8880de6a33811841de4dba108ce008c051 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 
0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.55, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..37a4556fdfc506853015d88da76e57090eb91634 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3b538d7df03182f3f4555527f7e182be7047d9d2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620333 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ef1b15aefa88058fe484a9edbb5d57f8db1501b6 --- /dev/null 
+++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d7fbb8ab26e0b2cb0db85f68a5c1c7021e2a26bf --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b4d5f0d0d5d4592ebb8730dcc78d51508ed8e4a7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_best_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No 
newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..78fbd54df42b555b6408684c3f3ffcead88693dd --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.57, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04975698519562428 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..99b79c5425af4507d93ea113e813aec02f01404c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.49, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956912 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237102 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4f660217efd3ba520e19c0c6fbd86b09dee4db77 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04943110704237102 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237102 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1c000d5f443efc51df67d20e1eee4559f2008e20 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f87b2e62cfc16af56276cc82cb3d23164bb81996 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7e83df27b3c0c356de036ec383f9fe22c8632ed0 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_cause_effect_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + 
"acc_stderr": 0.049431107042371025 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6af9271a7ad7a76ef9b39fc926a24913078d2952 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..750bbf74120b0c8fcd564e22767dc8b9605d5226 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.37, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.048523658709390974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..548e197203cc2fd4c9b22a714bb8362970ad771f --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04902071300001974 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.38, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.048783173121456316 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a19ff1653d4ccc79d225a594c6c7d5bc08e919e9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dd095b6dd66da32722728bb5ca4b44e2a81ca6d0 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2cf193b90b6ae6844c8d11d90b189f8659c70b46 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_choose_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04902071300001974 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..15a2d7623e63bf07c4f456cb4ec424158d86a624 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.61, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04902071300001974 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.55, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049999999999999996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6a252f5ec43b24157b5d3e562438805674323f23 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237102 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ba90fb3f88edc338e43e1f47d5c61d6a2134777c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04943110704237102 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9412886b475fd59034e13e5d0f0085cfa831ef4d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049431107042371025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..902a0ec9afe4d7eaaaf00d9feb24250b3c695b8b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.43, + "dataset_path": "super_glue", + 
"dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1bc6ae12c195bf3c48fda74ce4159d2d2d712a52 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049431107042371025 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dc6f280bcfe0ff410c610079156dd2e82fd6fc36 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050211673156867795 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_1.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..138246527894470bdaf4672d83d1c68a6d55c2b7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050251890762960605 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.37, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04852365870939099 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..685880abf6272ec7068e4c84276f795771ee4437 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237102 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b53b44b317f981985e0f86d2cb01837118ae2279 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04943110704237102 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049236596391733084 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8e2676f16a1d39a3913dad8e6e9319163e3b3e81 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..29aca10924c07fdc24f664cd5cf1b56da0044d7b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_copa_plausible_alternatives_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049431107042371025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..71e54bbb5a41c3cd94e5294cbf067e446d187b77 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 3.0195237412009357, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06083199996700688 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.1801203066264519, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002720982526713175 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.2755817289379272, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003631624010853788 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.20972323261781503, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0028606817573562354 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.0667960442666357, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014395935463214797 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.10315461019309466, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002160644851825999 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.07799688762882317, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016150990465315606 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.1408194558913243, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0019537355788883993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.22086380856877, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002913704901414896 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.16577291549436218, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021511059705423764 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.15471647764935745, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0023187171223427163 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.23726074831351426, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003123412402969155 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.18010845014611573, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002424941965882981 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bd7897e098a47ab8b495bfba37af9d73d81ab027 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 11.453380248459945, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.18516188812979686 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5373318783599824, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003917154984030972 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.42172162556828374, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003363645302615997 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4476728484979188, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0029280576089271435 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.25820630689480006, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002905622919692422 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.19826349640259594, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00228926623954119 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.2113574436167878, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002207577446796678 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.39371548089187824, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0034038731394905224 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.30546374911268226, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0027166850788240935 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.32549231149234764, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002487047300178349 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.4422536962941675, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0036803390199736026 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.3448887088986636, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0030186579678220505 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3670610104063968, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002755900373058885 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7c3bd98e38e862c4bcbfe8f4b2575c9132b3b0c2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 14.50498117442687, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.199944172481821 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5887790541929973, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003262651099040029 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4787709415006632, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028961391036440892 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.5033284407510777, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022482442873726256 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.29848800947384685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028146889310476536 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2386615975453258, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00227464540650362 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.25126695886715306, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021103093049905787 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4335431846872657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003041932502672048 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.3504156877121987, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025130404124840485 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3688360845444858, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 
0.002144196545004686 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.4889683087018754, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003238205206471699 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.396234345126879, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002752773124023653 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.41705421018227823, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023359077868479123 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d1a5b3bef3f71633cdf2fcb60200ecc2042fff41 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 15.52122027752411, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12178399428274998 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5943056234570641, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031613114217466545 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.49080579235224514, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002887432702643965 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.5138185171536801, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022075774721638454 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.3035973599869196, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00270545738266106 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.24895866810157455, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002353494705639878 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.2600892788255361, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021258730379320663 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.43863053675479546, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029800420206917526 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.36085145822203146, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025572884098848765 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3778880532609749, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021589962574373165 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.4956533881700253, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031521759054215863 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.40860648474620076, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027644229003361666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.42804012925193735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002324183115712361 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..403fad1097ea6986d07de8d95961fdfb79820a44 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 15.80040151989149, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16820667809122108 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5960169570684771, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003153282877006394 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4956677150438472, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028826068990134264 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.5188871554976948, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022652768418355116 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.30558156632064254, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002693196777521595 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "coherent_text", + "rouge2_recall": 0.2524058554563755, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002376081098564734 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.26380022955746174, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021699688555991024 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4397322331548957, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002948840127314331 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.3648882670279415, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025648519457164916 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3819640735718326, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0022089567195582877 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.4985280260096084, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031611778556326644 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.41435411543293943, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002812123109588658 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.4337762791584858, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002402885949071798 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4d292180974adcfd46cbe29abc12d97ecf2e4ba4 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 15.975544621705051, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1579004448764518 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5947966548185337, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032050395613254807 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.49593447472794544, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge1_recall_stderr": 0.002883113398307053 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.5188499671667609, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002275638577386502 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.3062471081766795, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027809428178784284 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.25287403669722214, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023773224583267306 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.26453287785708496, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002209135453833924 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4404441060880683, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00301839351177247 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.3662472446966816, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002583976795011767 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3833627449905156, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0022528184957952757 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.4994143702924253, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003219053679193333 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.41531212344400226, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027683691583775144 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.4350608743780979, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00240185780147281 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..90b59c91a756b21dfd7563c5614ea81a5c366801 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + 
"bleu": 2.1585732992390034, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.057283755545401806 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.2545704766225674, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002778559965844538 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.40955804247489414, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0034632388293561835 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.30605314016841034, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00295652774844176 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.07997732391252783, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013066533136759076 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.1258719249085403, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019245060999262846 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.0953144635435075, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014784395007114161 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.16837333279413402, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0015437698900107716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.2821392548746015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0021460486676796752 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.20529901859047706, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0016623332234901712 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.199053080158595, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0021506096653628216 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.32312955789441217, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027345035801367985 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.2399648342595197, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022887681185580016 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..223b27571dc4934f6c9adaa4f5570453f9b679ca --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 11.125306912232396, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.17790295386633892 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.5569688477314088, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0037048052877115076 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.42404460903081953, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003244565701809299 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.4560439580303458, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0027412591881031878 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.2613191609571208, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002833212806742642 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.19573012562121292, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022433262821983796 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.21079598546546485, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021486250857995085 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.405669026220581, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003271384812666787 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3052491335790724, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0026212753055562302 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.32935726670214416, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002340422818555527 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.4548387827219991, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003512840404754266 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.34412912946816115, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 
0.0029023101925979514 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3708772964846722, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0025874295501989186 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..84773a54fbb0076c196cb928d3495e0751cdb2f4 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 13.813032450428782, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09230276426798453 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.6032650507763088, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003318432943559324 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.4672304478737013, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00294371456779212 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.5006076709725926, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022915149769133685 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.30416572111597057, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002857287549126892 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.23167563766203414, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022815742048801517 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.24840794312750028, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002115487840852602 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.4456942993345726, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003132022239080753 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.34237578293418114, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025028374419928507 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + 
"rougeL_fmeasure": 0.36762233413078943, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021522899608404623 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.4986399586926029, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032916563071556525 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.38468678128109907, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027331843934643553 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.4126125012797959, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002314858859168761 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..19ca10b65e5793737d6843bd6f870c00de9296b2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 14.483672743525295, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11929516126568702 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.6038758728327503, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032028637360418007 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.4744733062907945, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002876028623643228 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.5075591407090586, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022251762998982765 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.3055894171560721, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027886849751003356 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.23717581869311993, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023052901683455382 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.25378239680253595, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rouge2_fmeasure_stderr": 0.0021425356705422195 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.4460832287476524, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030259872274363115 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.34878741185181833, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002498911308943595 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.37357349751082325, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021616271893979125 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.5004728706993896, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031965878124048455 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.392287327193044, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002719219158202565 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.42012395813675424, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023314918549654064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..65dbb8d14d0c9ba100db1817b161a2f8db5d12cd --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 14.929762015804371, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12898047247308805 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.6022142558158895, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031609014593905324 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.47984349113713315, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028797812370297167 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.5110531953195654, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022411307696570434 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.3076095968625754, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002798498179123258 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.24178364515600512, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002329282560521692 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.25768372768308784, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021668151991038258 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.445074501071273, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003000311479460939 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.35356822619623607, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025548952867894027 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.3767138926335197, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002205434337178266 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.5011709897391451, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031888091946731143 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.39903846221733896, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002793611393672377 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.42503387255872077, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023845126436349852 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2e6d088a2ce80d2fcfa2da566233cbc8fc15b7cb --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 15.223644297182389, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1543953907459507 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.6015027603881121, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031982719234700313 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.4820792569040302, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028725797677984985 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.5126128110763807, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002267700753729786 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.3075111272260856, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002775456152522445 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.24398746828269524, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023580828733568196 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.2593212430045562, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021871613636693333 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.4448480337595403, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030234611940094993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3549826197897851, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002543994777103483 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.37783699954395017, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0022179761130291394 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.5017155875511411, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003220280562846132 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.4009107110812699, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002754771618482513 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.4268497329572666, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002388843734324405 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..09ccd7a8dfe02a674cb15f7f6306cbab5b3a5b72 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 1.954444748954989, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.051868039637637996 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.13769122808003648, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0017596998810661165 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.10639231990905271, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0013988711766301635 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.11526409537525575, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0013631415982193285 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.07169075124195784, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011707222728807847 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.058024124805966824, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001013674848977092 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.061308190194015844, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0009528331425054534 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.13598668737421912, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0017358845797809722 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.10496985768149185, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0013804037318383363 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.11375684850778682, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013428955348118651 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.12448022797409529, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0016204095898696496 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.09733342412728097, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 
0.0013323499351542672 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.10487679476333724, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001278747634008628 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bb9638eb441502503b745dec8668ee9546a781e0 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 5.689076651457708, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10955679808497602 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.30205344791644223, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0038011993380253984 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.2591489728142023, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003578393084400893 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.264668584146837, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0032621365523836025 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.12740361481749896, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0024147537412165677 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.10957433057544176, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020988896354448804 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.11095311869025824, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001983294663988576 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.24430027357651582, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0027435596209212753 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.20571374170532966, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025447737588058707 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.21195460544874364, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002274999033300145 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.24692727270265108, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00332540047296337 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.21143151597592733, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0030797887136175474 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.215850348956597, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00282606722123015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c48bf25f22502b86aabc056af554adf959dae116 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 9.361606353028591, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1451212245342907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.40668806398214247, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004503127021297256 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.3430030141127422, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003924569338343976 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.35373295631398277, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0036780575347303305 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.18924602497072013, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002957950517929727 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.15855760926664064, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002460433442821022 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.16320225416214543, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0024001272796585786 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.30907754734477744, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0033296328679694575 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.25878608350993737, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002856832446420683 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.2675200681652855, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0026402526220161704 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.3342219925602536, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003958418609375523 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.2812305254099742, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0034020232164490628 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.29005865628135225, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0032174705384268136 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cbbe675d5aed368880b6b6895e43689fc608d4de --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 11.414656994987986, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11270862432238564 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.46449466056623623, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004562321628279579 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.3842244124504893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0038568970386503925 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.40094473628048855, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0036476150229787043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.22687900450207923, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0030906636363050716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.1855486684910055, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0025302721311659462 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.19370105271832214, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002470342190509859 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.34573316001805976, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0034748050946930213 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.28542502655034385, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0029279054521511878 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.2976665060188424, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002731769606142919 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.38244492367758776, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004035511089277577 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.31615336301566255, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0033904198313919145 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.32978958530156677, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0032355902278707826 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No 
newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ed63ae37fc91ed1d039f8d22f574b90b533e0092 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 12.732397361679025, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.18621333707534024 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.499291122039099, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004476245438047599 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.40989488096064325, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00370623790346038 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.42999610320261095, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00351827199150819 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.24841091607353702, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.003130775311395358 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.201675737588955, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0025289410126734155 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.2117257310114321, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0024896096126455045 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.3689143198658064, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0034839108392620893 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.3029948266133246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002899192792529678 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.31737123063793404, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0027279652357510213 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.4138527816259857, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004010246149220487 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.3399362615672215, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003336886045704235 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.35637466690999814, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003195201907601668 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f9bdebb850624ffef7f2fdebf4ad3bf6ed0096db --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 13.577169092929678, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2062668532660596 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.528290529517535, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004318140358067149 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.4316094452050039, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0035832199286407527 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.4548582412316575, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003357067322622565 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.2671985172121451, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.003111440606188032 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.2158163830401268, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002528320168515306 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.22760620908222556, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0024740789360211783 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.38911274094824855, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0034729933166221696 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.31774835943631824, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0028682350509974387 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.33465045341993305, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002709680333869425 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.44003076242549466, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003953562938723459 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.3587907276906629, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003242670548648085 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.37843382736640735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003108090184096992 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c40c89cb5cc68c4645e0acae120065c6f02fdc55 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 8.153800130249518, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.19718858686762447 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.14254055468502322, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032976710551722 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.21985285950008995, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004764464623700324 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.16625763147318187, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0036077916351727156 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.07117779808817919, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0018071269269227326 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.11165748130728335, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0026568526220331497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.0834337726594754, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0019832478132023163 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.11919497082898337, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0027419572678360447 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.18525665107433087, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0040358655783698395 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.13948285594182427, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.003021076305392557 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.12480961931955924, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002918263840706436 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.19394917883355808, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.004289318267212664 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.1461396427981472, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003226960516681579 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1c21f4a2702bd32dc0d2a9f47d743737bda483fd --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.82816578683988, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10613794908694005 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5741208333690715, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032791825020937275 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.43616299285547877, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029918336223318646 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4695495704689584, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002364549723893871 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.27417940276408975, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002745882165587474 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2046937372687625, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00215463330838664 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.22091389030064756, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020601818538796947 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4168038088426164, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029886435691974275 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3132001002805105, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024293669614656285 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3383298269843499, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002100531054867843 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4669926334976919, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032045235446491534 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3531706097054418, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027268177176508718 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.38079591964268383, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00234065536986238 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 
index 0000000000000000000000000000000000000000..4ef3308d09c915646509a3d4a4841695d04d6bca --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.210994141328136, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14395128449450548 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5957938406647506, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032189851910091535 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4666858600777237, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029439118893739937 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4992077922612147, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023268998699713796 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.29925246097428815, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028135040853843815 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2311260083068878, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022842743345053117 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24745510003725912, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021577622824456 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43644170204107635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030000260097151686 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.339812800549829, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024913510362416656 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.36403535470109827, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002158659011059573 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4937561615312527, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031994326527389665 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3863879597392378, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002778922240841303 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4134385994397868, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 
0.002387535627435029 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7e8ef48cd287121e6293d1fa9eacd074274c65c7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.758419275441623, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14846553640981838 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5969365877168988, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003176934049191526 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47321803649209376, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028949729129975907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5048691947729075, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022904626291377636 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.3021673329464843, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002773447672105312 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23749889827974513, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023473465141403524 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2531409526666904, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021887540705134477 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43729156400302177, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003007804981399626 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34523784521876366, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025266731757362768 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.36861382037574736, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002208485032472364 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4949310693592707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031940660026111094 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.39247584122491685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027974990919393808 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4186873543720982, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024264140364972063 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5d5dbde08f110dc74fa41a5a0fc5afc8b7b73c22 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.9471441955308, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20049087654668654 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5964162408645288, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031670174643345854 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.476041597124549, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002870553977627495 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5068439147529751, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022702971756168133 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.30318643893717323, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027688869090984037 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2391458905396845, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00231587546078518 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2547260269260162, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021693766012080536 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43667748453350236, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030068254256241543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3473159987716765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025367602883456315 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3700166988328032, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0022206494000581454 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.495394213066299, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032160309119094407 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3958929166908767, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002827379825231143 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4212470848260628, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024487659358586895 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2fcd7c40bf2f43b3826670513d58d005f59791f6 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.12300988029553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.27781770916239995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5954584960080906, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00325049366305589 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47888879196863693, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028429629695053142 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5094338990091782, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023180695212366305 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.30380630927932106, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002835445529306852 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2402751342075056, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00226887033631254 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.25642542561601567, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021909983236262304 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4360299975558518, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029988845638560017 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34982928906069083, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002474832286550078 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.37228821193907236, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002208419488038826 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4966033704013741, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032394543771446537 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.39901987621454466, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027430779669785923 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4247238232753859, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024321405233947197 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e68dedb3a06a7d1d693e058b0110b1688c9b639a --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 2.4148460846552413, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05036198012608734 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.10560357672768791, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0019908255113940676 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.2194638304445651, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003995820797680711 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.1391414255890394, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0025400979508730013 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.04417622381086348, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0009448315430017289 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.09527403810619732, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00205880226103097 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.05876533984038407, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0012344870500762685 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.08914439768173961, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0016567906456468919 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.1883312467548728, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003488302660435668 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.11809718257668743, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002146842991910023 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.09300944711571149, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0017956015291556789 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.19303766058306765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00359467107273429 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.12249168865764576, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002289868109882485 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..210eb344244b0eb606886849d726cfe8d67a434c --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 12.297262313527675, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16385815790982017 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.5839558387860917, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0033331147772366473 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.44014655260365004, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0030139756921077616 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.47525841358381093, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023583251642963136 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.28661390078191085, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002901449616488372 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.21116517306370375, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002214474291192635 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.22890209457362648, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021147133262873295 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.4309350512738497, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0031735439185340223 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.32067098855093723, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00249989474809957 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.34766320387998123, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021748450214048288 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.48023193119114316, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003335283437779837 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.3595547681362966, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002750841601478431 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.38918206086481305, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023621524251225866 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a8084bf0a96c38ab4e7df2f1279083761e6af4ce --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 14.483863606580098, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12131630196525124 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.6058141210991159, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0033088115891164284 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.4741213639409289, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029151815956458198 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.5066707261948298, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002262998972148006 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.31078761548465383, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002885785543521991 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2393070319511852, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002287716976357731 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.25604760044687425, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002140031183397711 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.4468805003217975, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0031330138309882435 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3474439134529135, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025028002612465204 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.37185442092317, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021578801071099915 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.5014613640312711, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003302363397632726 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.3912761034599153, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002741039283689631 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.4185811499353104, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeLsum_fmeasure_stderr": 0.0023421461714562 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5e141d4e2fc19cdea07c39745f7ab8fec10648a5 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 15.191517295510629, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11974969580866728 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.6075957192330093, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032637968261048796 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.4812539130894964, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029149831551134784 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.5130996047904425, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002285615607319131 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.31513884201161574, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028405657336013543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.24627662242850876, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023207152571040795 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.26271197369374893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021634028658777662 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.45074514333996246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030740552400188395 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3560656787748778, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025774337186458807 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.37969768710882007, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0022273418438245704 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.5067196286957013, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00327275943788992 + }, + { 
+ "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4005630163928573, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002774183370900057 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.4274159914050653, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023927259878927885 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..592ae794fa1bc5f909700663cdb1b2d26530befd --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 15.66121355308185, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16746194817955856 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.6047624718754928, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032141654509862407 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.4850925511554926, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002903801146577782 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.5159411031234575, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023173099434433014 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.3140652011020034, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028454442972433254 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.24919016906797625, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002393680033053097 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.26501632949202497, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0022417359787710793 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.4482536974201123, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003084926330807341 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.35780889315761033, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00257596254524435 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + 
"rougeL_fmeasure": 0.381084835759455, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0022819361864961927 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.5060122437920864, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032490282463882147 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.40566476036911275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002823543129525192 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.4315449793197658, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024568419006428172 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..871529edb10270b8c43da852d5c0d7d13be629cb --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_e2e_nlg_cleaned_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 15.723080080667856, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16941632514539806 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.603309962822727, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032242188989074585 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.48695983010677835, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028628867830855025 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.5168974474939331, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022721227951672234 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.31312278266787735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028270590416650445 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.25037720266691516, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002392884966417364 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.2656196908399968, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002228936591291352 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.44842365936776796, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030644240344688198 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3612862341962741, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025851127132254853 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3836187007598527, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0022832459662179565 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.5050355873000846, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032681356379517545 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4072970583395843, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0028059842659635707 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.43255318470589826, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024535753742802357 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c8d3a228dd8b216279d0a4fa0c566c53871f4601 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.10344716943873676, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0016259799248270446 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.26090880633509367, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0038477185681414384 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.14637569517250587, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0022073444081297322 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.01490073234297703, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006589770129935276 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.03873031105763643, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 
0.0017582245582100646 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.021257086751659823, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009366192439889625 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.08104406326081663, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011547653388531021 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.20585546458892723, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0028600607223324866 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.11486954284177152, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0015736623341304518 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.0831734971325524, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012946845061314219 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.21156835133750618, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0032138252754335896 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.11793220629910407, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0017742377889203762 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.7483386307387867, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05365818544561724 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..25deef9ad94a5102ab6c508292a25897579b2fba --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.10675646531069118, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0017079434120986104 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.21254174972245313, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0035442914228111582 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.13489677526242255, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0020188404213789704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.008741098009427668, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0005494909789404459 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.019852346836983944, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0012824372215275344 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.011736790474052875, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0007323025837186397 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.0786172237029301, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0012104869162365519 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.15650442119288974, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002520544874358908 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.09907990741492997, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001390441723617538 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.08565732122109287, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0013227204476746756 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.17284931760421135, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0029211829599186004 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.1087833032639467, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001606809153789491 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.5016993367817201, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.04376564683106677 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..346909b87aceaa26643a3c00ac1b619dd4f1e304 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.16549164819743437, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0034526382178408383 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.18742513953883105, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003806437750265605 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.16241776794419802, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0028848994027258442 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.024661798518168125, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016895454273242517 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.02739768677136293, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0016421467568835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.02339429020564221, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014396242595328612 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.12663862154443478, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0026682593533234114 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.14121993779286504, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0027313372184720654 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.1232803959263642, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021296445004168572 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.12895564267261075, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0026773474157606443 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.14650288142341764, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003019787783429863 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.12639544876411474, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002202317812711371 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.1863991054607887, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13126259759197448 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2bc749c20ef75b1b4f3bda2c34a186b5a0e15607 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.199491926427676, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004346987952839152 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.19499186373063174, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0040083275173924 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.1838191377493394, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003540784118642653 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.039281858500698104, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002301025224624484 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.03776716790343215, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001979743786030461 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.035667825755264795, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001908336809093584 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.15200765203777006, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003504702651308 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.14674619587776616, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030683749877275252 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.13905902806555284, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0027923693461365568 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.15365853324218348, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0035033293697679137 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 
0.1504264543019513, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0032229831137894752 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.1412465961407836, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0028176425939316443 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.927416551659746, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13768130191760655 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c4c4d922b889da39a95e19f700d971cdf364f3d1 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.05932877286153886, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0038035599974941344 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.04877768076364228, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0031918925275879794 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.04942769980269383, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003100893627673165 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.013087116001066186, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0015109923619292477 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.010914081354624636, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001152072609333735 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.011000794695724792, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011742366069366823 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.04606934608031163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00304472521629779 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.03712032219694214, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rougeL_recall_stderr": 0.0024797647071667997 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.0378284212328296, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002425844759650908 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.04670525345751398, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003059955056808382 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.03806261734141931, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002530887858069215 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.03853556917570637, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024502420061577436 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.06705573747521477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.02166430236564802 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b657c583502db74ab746879cf497786c193d0cc6 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0fc378c5021f0647362530d97e0f1eab7ef44ead --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.14574516028104512, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018712457151061363 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.3449236724935976, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004320383977323064 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.20225260124930794, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025001046641434125 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.033711341577120454, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010598029070401104 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.08308586968025274, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge2_recall_stderr": 0.0026382706337572917 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.04728640552761858, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014730864192413137 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.10980468800637298, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013446137122648245 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.2608506185694507, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031928603764024353 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.15245422876205902, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017970754977725638 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11677865105898154, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015785067645458074 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.27805190370113925, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038070316552884773 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.16232577634393053, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002139935162676087 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.7621407465701262, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06113609142697404 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b65dee798475c5fb47e24d9cb04b7a824fc36c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.163542980651184, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003081723626596105 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.2810483758189185, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004490053395879538 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.19140416518663397, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002823344391795949 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.031135203534715963, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0015263295233267145 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.055979904675050554, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023805460524915123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.03666889011459831, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015411558646631956 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.12253980798125551, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0024936905994992586 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.20840253955539761, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003332368535118191 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.14222920999260133, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021284776195994895 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.1285570844683415, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002531324924973325 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.22276474556133527, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037612883055859847 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.1506050075529136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002294513585647674 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.5243771210089576, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0660301007465096 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9a0ab582455e28092a1927f2473b8cb40bbf981d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.2525132293787072, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004048687072651401 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.2514932935540875, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0038107088016241307 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 
0.2367541422956314, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0032909140337269572 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0575454237989695, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0023125091362061257 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.05665840523908167, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0021563440811016026 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05361519189651879, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002025045267401813 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.18999003721752228, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003327472779717041 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.1883122635578014, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002983932837681397 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.17748346894897998, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0026653380372031374 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.19185044567781862, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003315470542686825 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.19181396779702334, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0031212995449732017 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.17976252206146795, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0026772946258546014 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.663637021745117, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1489764859030127 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5c7e782d2aa522a0d60c2a5137dd8a91fbd89f25 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.25963319479441493, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004586241879090658 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "DOC_tldr", + "rouge1_recall": 0.23447118873035172, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004097558867330643 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.23342689177585182, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0037455640107493257 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.06148774851494584, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0025664242616005484 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.0555392665292451, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023190320837292366 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.055403348824886504, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002239667742211083 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.1935508847274175, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0037493722409142932 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.1734702299724183, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003172612394305438 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.17288647337982307, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002961595593698407 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.19489189124727585, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003750963505043716 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.17573627157693272, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003277150547151035 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.1744328426789123, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002981641820828236 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 3.0167779291606345, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.18715886411594176 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..652df80723f50b7f4ee08b434531bdbe7157c499 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_4.json @@ -0,0 +1,133 @@ +{ + "results": [ 
+ { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.07096229489647175, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004454103631846769 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.0537747056537198, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0034117193891468585 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.05658003625774032, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0034580616635425912 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.01908631835669933, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002058223769449137 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.014089456125343345, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0013631204130799945 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.014915911149834407, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014210306358262497 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.05488232507714324, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003638011853955081 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.04007173199742235, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0025968781288273827 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.04254235209217209, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0026641802157739655 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.05528538478732102, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003650048657550538 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.04060499621647696, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0026328638098159266 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.042987990330416365, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0026862384531954266 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 0.06857179391271788, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.018883898270683333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_5.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f8c53383f6c807f433ed0c8e367ea798925a9b46 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_DOC_tldr_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.0025216182263907015, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007538054243947512 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.0017753801346017414, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005160214923546179 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.0020234066276709393, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0005877963929551324 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.00031905987094666343, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0001753545101664957 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.00020940537449971414, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00011249708102079408 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.0002512424682236003, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0001360024645875312 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.002044972230876781, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0006006047303809878 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.0014822793037785856, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0004337171048834188 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.0016644800581129724, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0004779782544389454 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.001991370172557742, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005886709786944241 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.001437140728352026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004219329829170707 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.0016154724619355648, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00046538388224745525 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 5.985570341439483e-44, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 6.0145592778949524e-37 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a643a4015ba8836bfab23b16ba9631a3016efa82 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14108556952980922, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0020956329882564906 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.32629804825161535, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004756174570847187 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.1938850065877847, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00274725146396725 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03392541732989178, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011674749125079006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08137541100012088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00280912091771862 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04697902966116714, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001582424189239984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10765433273641108, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0015745202745112603 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.25042455718394274, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036806317265347037 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14812739050025472, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002066618016210627 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11258249311328207, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017981489130128542 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.26182933742867265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004140212650666321 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + 
"rougeLsum_fmeasure": 0.15484724382764298, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023502913687865295 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.8671229080125915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08065653508825953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..010b4fa7dc3364fddd18a3fee7999b3055338b58 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1818208364925118, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0033811439409721453 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2952342650035309, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004323771519378331 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2072547434350325, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029185590817686906 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.039255544280436615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0017970345435637744 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06401578769657801, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024164697736714135 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04454429056503735, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017367902881785364 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.13745172437510383, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0026008926362907463 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.22412058258654055, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033458865795404046 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15664138971417202, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002216089409249802 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 
0.1399352840204044, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0026319234043525545 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.23050320162406268, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036405761255609054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1602204327991172, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023319587866224892 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7643751753804513, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0800024640836125 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..74e4a6dcc577f6a3172fc3e7d05249cfcc20fb1e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.2472505852052675, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004009850648659755 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2567441304949634, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0038189444756239026 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.23644320153754822, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003260843172949545 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.056866160489061936, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002420240118413503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.05783181540065835, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0022927551175836464 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05354419366516433, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002106078230291637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.18570698243499414, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003303016306234649 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.19264012368473676, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030469236090590076 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1770324612357477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002634383182795547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.18746784009032694, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003301909604388057 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.196059514574862, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003205680714038162 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17924796506451007, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0026665867584947088 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.705819603984223, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.14137667865787865 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0665587033f09a6a5fd4800ed0f3c751a69e79b3 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.25083050440215976, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004340710144695701 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.24313830962431818, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003987068682586417 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.23375535546317172, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0036448754340216603 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.058302376165758424, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002462019491771468 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.056822850129455496, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002365353936727513 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05453694888387553, + "dataset_path": "GEM/xsum", + "dataset_name": null, 
+ "subset": "", + "rouge2_fmeasure_stderr": 0.0022209047576266894 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1865779794802579, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003475251260232787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.1815458312356556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031953873575430005 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.17375448749498826, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002893029439925038 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1872796185903397, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0034817440259912887 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.18253965500420624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003256036671696088 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17450339640702453, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002909832621947404 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.1173819151981363, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.23709102615992358 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..10b4834eefa9bab8ef3cc063d119ea6fe6e7b4e1 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.06668389338088138, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004287951717923472 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.05770356648149944, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0035956507787910136 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05594753186729229, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0033676619143302084 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.014963307619343467, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge2_precision_stderr": 0.0016114455998580692 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.013741449593915791, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014260391208111062 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.013296321991506897, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013642061125793434 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.052652024080707335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003605493741125939 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.044415851405158295, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0028463083421698503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04310400237676907, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0026639130697933863 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.053307788896004335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003626638986661626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.04533527259366541, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0029075081420929395 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04383519297886595, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002700540506868496 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.15136892255345546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.037709514860432364 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4fe0790ba4ade3aa008c94b91ce73b4593e8670c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.00273620898511371, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007798516293741871 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.002264558087005939, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge1_recall_stderr": 0.0006160940919096034 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0024320284759234414, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006736354616386897 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0002196833564758093, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00011015503412814092 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00016199733180865256, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 8.184314454744501e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.00018583265458717857, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 9.342731096527664e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.002119565890814211, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005829990564075634 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.001776402682704595, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00046929937422078794 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0018932762031196894, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005068809117616748 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.00220984304166733, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0006273058335995559 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0018523030709016646, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005016261609450595 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0019743733562370912, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.000543487074589739 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 4.919560458856041e-39, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 3.5763240226586303e-34 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..082f0d9091497916ec774b1592f7a78affcdbfb9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_0.json @@ 
-0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.13877902954547866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018329219241202802 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.33229446957580333, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004296754137342145 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.19326845352360203, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002465944989792081 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.02676204259801757, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009319137963402368 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06705747449853536, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023932732420159494 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.037715758291965086, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013073049053882384 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.10017275416736435, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0012836317273353171 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2410010033017358, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031284342960650143 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.1395940328246557, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017302666137824698 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.1111454306741116, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015327186901242722 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.26752711604279944, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003721123931794142 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.15498057414846977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020821371700032684 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.3922923422113362, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1085694848628761 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..265b357ec2e39703185f31e076467f0a5f059788 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.15438000924889625, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0024173163816097426 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.31099805974371736, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004369924984110172 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.1951883864682122, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002532666335194767 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.029200709009449446, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011802714510965404 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06385551767647898, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024020535837821445 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.03803777610925088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001401187389723631 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.11349074308725517, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018559801970629032 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2282612664387364, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033052644499383292 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.14290021004189724, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018761553752169517 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.12144057591718488, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0019272376236010539 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2475391717657514, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037512061755519796 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.1540465769798701, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002078015927158577 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.5418959887061783, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11447597977337433 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8bc42838c4892b24aca7aa1c71b22dad44dc8b1c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.21235407890938018, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0038584821309733197 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.2922415035902346, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004426499849196656 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.22345527223273026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003099961120505977 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.048280692831307814, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0020950935786571852 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06626083711364002, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024510193668861795 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.05031306585865172, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0019103069927276113 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.15845617681932553, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0031065349816888337 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.21629686584539104, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003363162795489695 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.165786008724537, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002468629343967069 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.16392354764710418, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00308613239016732 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.22961193568282157, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003797944820865862 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.17351507856807527, + "dataset_path": "GEM/xsum", + "dataset_name": 
null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00256723897382965 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 2.087587201632086, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.18248784506530719 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..41e3b16518629847734a122819687614736e16db --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.22711875267734596, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004285693842994407 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.26630999509216713, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004480479149823449 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.2231300082597102, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0034033752307014967 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.05345269343858552, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0024415003575837513 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06205142343969058, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024890375706590195 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.051537755471058315, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0020776538363222225 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.1703206182001753, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003447489549005928 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.19844320254005263, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035032133447970293 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.1665572765578783, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0027440313935594893 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.17403775435701263, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003413819988075513 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"summarize_DOC", + "rougeLsum_recall": 0.20793289910577953, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038444261569951475 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.17187047667566516, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002787059466070908 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 2.50653501943938, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1003090971755565 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8e0412ed4860fb9455b98f160c34150dbac1fb67 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.06414485296013633, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003867582234098727 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.0654670128693392, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003934205471736375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.0581230949714921, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003314213174877425 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.014926376522099313, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001564843306679548 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.015583289386150202, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015885386879519472 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.013514623561041836, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001316872673175581 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.048961063263220264, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0030673084914120074 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.04867370780315506, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030019830239967214 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.04353495089271025, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": 
"", + "rougeL_fmeasure_stderr": 0.002552190003567289 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.050693442956428234, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003120026662916494 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.05205657833407217, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0032602925531358337 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.04571485896067389, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0026608912164232407 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 0.36982267172015404, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08144270702165075 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7253b6be431f9b347d0121c262185afab58ef13d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_DOC_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.0026583117523739054, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0008091018808969425 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.002178705467661046, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0006454400480476721 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.0023608978474116056, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0007081747278920328 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.00045740423098913663, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00023890947709324936 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.00034383593345857494, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00016914370471971473 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.0003900360808394216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0001966707194013907 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.0019692588599943, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005871311929633991 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "summarize_DOC", + "rougeL_recall": 0.0016273983057241343, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0004734697044949343 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.0017521055869624688, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005149050874063948 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.002125175022613541, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0006406715438574456 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.0017610229095255533, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005181385995086363 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.0018951492030044802, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005633639594497325 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.2867055492649607e-40, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 8.25672229885246e-34 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8ab8ac3a8857e8f799911bee92760bd1d19e613b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.14681218319635403, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001884064620986668 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.3508532628212256, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004480396836342542 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.20445551861577294, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002540631262237117 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.033083758715882615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001075607005127185 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.08359653371387508, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028011003001917915 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.046773184998710506, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015164232120699853 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.10884861486262903, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013621357621873037 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2618029187147605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034269796199026133 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.15179708394425984, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018576395558564697 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.11718412599487378, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015873957952529678 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2820874940255534, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003951298591378739 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.16353825551028955, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021770267011811915 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.814930387249839, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11771121619409691 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..32b59e92a4f82452101b1e9e3d89d1eb666b0cd4 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.18103718501348756, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0033473865145306047 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.28733295807206627, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004482174464490647 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.20309050137464207, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge1_fmeasure_stderr": 0.002889520835173405 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.036419012458562515, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016871059092369717 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.060311180548127855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024245045640873164 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04098230590585054, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016199850679624347 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.13558174388743888, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0026523267315021882 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.21359154903856134, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034331297139569583 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.15089891356003465, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021828562095624793 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.14117090393817158, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0026656506352231435 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2267992928621584, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038282221891750234 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1586506104203225, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023161718459253386 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.6291107349733265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09557892175976454 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..be58a462e5083363fa042dd31e1435aad4e94e08 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.22803370323125136, 
+ "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004013273187193158 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.2570614680151902, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003783252940020481 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.22521229342157734, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0032317381563035028 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.05022811623323242, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002274677428345127 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.053690747036890564, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002164698578030171 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04819699499570192, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001989176916835738 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.17227480496190953, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003234340611642025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.1942078038523242, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0029673906336968522 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.16991437391265418, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0025765631696274076 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.17464672535677234, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003215033545422968 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.19974282808154498, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003175109736049393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.17319699662992305, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002605048780862527 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 2.17318093098099, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11661399106251272 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file 
diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fd36b2088f18bde9d816828ae525fb06bcb0e2f2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.2309959894697129, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004585855209595658 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.23766904690279728, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004100502142008435 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.21933409248379987, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0037339658935042247 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.05495177086256812, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0025120253907978123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.05330679489401812, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002186452458727165 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.050299856364729216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0020740941892729912 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.17327034438097938, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0036171037821970263 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.17865802097677572, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031893640720526708 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.16438642156838906, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002914893477118624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.17524258448363655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0036164027764869844 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.18241558682117898, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003320041583026863 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.16684760046910785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0029393030131824823 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 2.616296095236472, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"bleu_stderr": 0.1539703441634834 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c0bc5c2f5ff50e4243e0a2fc33629653834b9cda --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.058089805059648995, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003880608920827592 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.051805567759303485, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0033664922868554847 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.050341607953639705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003194838646765286 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.012863518208923428, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016654669126856404 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.010423647977701705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0011866496937258718 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.01044448138213699, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011835889688472476 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.043703688081186784, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0030624480401055063 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.03767748319262427, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00246554753588428 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.0370337628617168, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002402644072481797 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.044789301089212794, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0031139172812709205 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 
0.03920567863599592, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0026011812719479548 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.03817599729757219, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002470465261805821 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.08641483709606773, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.022651861523459035 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f29cb2b16260c158b8253fb06c88464d2c07dfe8 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.001429388221841052, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.000727956993087229 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.0002449280655247975, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00013374223120064533 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.00040066185006150704, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00021090992858783893 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.001429388221841052, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.000727956993087229 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.0002449280655247975, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00013374223120064533 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.00040066185006150704, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00021090992858783893 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.001429388221841052, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.000727956993087229 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.0002449280655247975, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00013374223120064533 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.00040066185006150704, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00021090992858783893 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..40bc189d8f207d5d0547785c94a21cad251553b3 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 5.867916213031358, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20628853227818275 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.07942228360422791, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0024044279301916396 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7054367698131732, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006632442357897306 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.1298804110387953, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0029630718347134926 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.06361144656013536, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0022825023684296315 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5472295953999456, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007985229704536755 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.10376354583297816, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 
0.002967905827484413 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.07863671663541807, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0023952286615590386 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6995132449021918, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0066928017177045425 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.1285735663218738, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0029483810070099646 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.07718983804306988, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002403037945378598 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.684699153715215, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0068762487858985465 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.12598534747867804, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0029624549245465407 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2ff5f8913d17d36ff241139ec356105ae79157f9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 61.335222795778705, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.8622771243249686 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.6980665022626561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006567707150422072 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6593767143434258, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0071185255670560355 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6595089902989296, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006875712478457699 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5473256014844189, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00793225012107229 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + 
"rouge2_recall": 0.5253208631681442, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008110275114235748 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5247847660788326, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007965430836769843 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.6804243326251539, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006774860773805959 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.646527809236356, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007333307687760897 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6459449348115168, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007108094606646681 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.6845034339007631, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006741022000997241 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6485924063401273, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007291349785460433 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6483381084295872, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.007060527023002106 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..615fc1fb013f21932fa06e8ff52f5fb5389a4d1c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 62.603402535205106, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.5475644169974412 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7181416985150058, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006252053466859557 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6760628114661585, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006824731568876389 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6802649706088744, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + 
"rouge1_fmeasure_stderr": 0.006555775754673784 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5695607426529345, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007695163044773544 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5432880806274307, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007893817566829577 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5459465123609488, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007745165296981821 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7019668030881252, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0064563320671559895 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6641579045487708, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007044290418742941 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6674947636505139, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006791046114300905 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7050246857130568, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006423110240312336 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6660735324572596, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007004108761350961 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6696194322813839, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006748257374785954 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9bfa49ec476a736a8616e76a8fbad4a858e4c5bd --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 64.06004196791218, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.0446093860205532 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7232440140186012, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006116632364498603 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the 
solution", + "rouge1_recall": 0.688581159330267, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00663873709207145 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6901181418640525, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006387092151812126 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5753318864237054, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0076304447710168035 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.554044381568116, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007810039162687954 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5544379445377776, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007668802228151554 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7068401047024229, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00631934193944798 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6768332672562108, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006862112088829738 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6774190073052743, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006628030197945172 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7107370870856662, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006294581178188967 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6788431055005977, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00681711472710704 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6797541072212353, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006576199106862475 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..913d87c65f9af871fa5820b32546bc0187ba9cd9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 65.55325196543947, + "dataset_path": "piqa", + "dataset_name": null, + 
"subset": null, + "bleu_stderr": 1.1947127408461486 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7282001628299452, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006078864792051913 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.69880441201827, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006520733587303046 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6996217800102895, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006317598090843756 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5828059508699759, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007610735477678452 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5649714199883233, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007745097163747005 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5652369927255047, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007629351400576189 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7140909753910173, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006280916318478117 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6878212259946481, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006728703316848708 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6881606645893026, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006539431927546592 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7165909256663964, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00625485888706704 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6895293461590946, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006694901252546183 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6899882354334018, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006505404310157879 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..1c863d5016b0adff9e6aecac89a5667187451dd6 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_Correct-the-solution_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 66.64515267205795, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.061858736228344 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7294671013001317, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006065785974127885 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7053377398801107, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006417448274249115 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.7046939479107203, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006233955812695059 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5861943307872125, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007582514378127309 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5693588626949632, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007747442919717387 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.569623452216584, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0076345097393273175 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7155258182519211, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00628300942244068 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6941393539742509, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006646700094687056 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6930681146662367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006476139258559119 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7181565430000993, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006242949773749711 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.696180538077367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006602511857250467 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6952120905322114, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0064310406471001225 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c103e3ee12b69e3494324c9b4017771ea464ca84 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.49455930359085964, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665133500637059 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.49455930359085964, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665133500637059 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..521b77c92ae9efa1c96e048dbc00fedf5f3baed9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665713661738877 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665713661738877 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..322b191bad61ac040725036db763bcabc225951b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 
0.5087051142546246, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664055982032842 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5087051142546246, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664055982032842 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..56472d2763f94da7613422e1583a551172bfa475 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5184983677910773, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011657837583818161 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5184983677910773, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011657837583818161 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7b36057c77d3a97169efdab19ce5a2c0cb1fd969 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5190424374319913, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011657360703051447 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5190424374319913, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011657360703051447 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ab8e9b2859fa2e758aafba073912b872fd0e7a25 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5206746463547334, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011655846995729705 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5206746463547334, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011655846995729705 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6fa9c368b94dacd4e26bd6d1b1305a19c7c0af1d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.1485041698567138, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.007315927282004117 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.01974171386059608, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0004952361009360189 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.21534097998297544, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004166248254601441 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03437889361344771, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0007987086688652439 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0030525841083055595, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00014616090889610437 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.0367404892573986, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018644594321576408 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005373478710058346, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00024914938847529126 + 
}, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.01790945229515576, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0004313558223644517 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.19829194789252308, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0037997755232370687 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.03125352104038582, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007012062341285741 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.0160655540543357, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0003987980222331984 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.18249084236327845, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0036813208394015054 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.028029841637843726, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006415770236915785 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6d72ce259bebf48fd0f81a16d371eadbc3879ea5 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.29412714768586884, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.034360669972773594 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.07368232247410113, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003164750969347226 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.15766787120459927, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004114679609697215 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.07158904393328168, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0024799328487658226 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.015588545852027387, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013404934159168128 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.02749471159881858, + "dataset_path": "piqa", + 
"dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0015931425342235485 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.01427616756655093, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0009774976022813842 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.0613981722284925, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002639261551928193 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.13877990083713132, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0036796850148855835 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.06022053549130187, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020697663168887737 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.06279067333856057, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002781722984417726 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.13573872320792, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0036419851235016585 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.060445332342990314, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002126572966319134 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..96c5082e936e627c82cfe28d94bed4b8fa452d23 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.7257226104526276, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.04974444170266115 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.06838800622837515, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003415935337029084 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.07342671774462553, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0033235156125216965 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.05613337557154679, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0025574265364955565 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt 
needed", + "rouge2_precision": 0.017058473371860172, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001674645431479306 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.01682784685476785, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0014448109257036853 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.013394834397133638, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001173056970239741 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.058968248594693275, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029726310561432943 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.06438738644094597, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002961784919389523 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.04866283010304908, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00224397834501404 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.06049770286352172, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0030730882420140623 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.06461911786967894, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0029694396182963407 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.049582431239121254, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022971793124355027 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8187ac3c77f38b0686f697a2d295fff0d69cae30 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.618645347711144, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06077101383767482 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.08132178370310313, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00374448816425401 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.07900250321756425, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 
0.003360802844922738 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.06473304019823428, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0027236205074401805 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0183620415895074, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016611524429967466 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.016974165028495372, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0014683224364738306 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.013861400466696013, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0011327812094035596 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.06893019272931476, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0031686455340987937 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.06953656378352016, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003022144575602847 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.055560374775413285, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0023456775505970827 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.07140577564724335, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00331410052405056 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.07010282395858061, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0030213293550535717 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.056891001156107575, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024083482693287536 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c8144cd114370709a6b7d16bdbf0dc6674396c4e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.6316373732726734, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06439416572051122 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.09702203366548948, + 
"dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0041167329926756905 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.08876078044435651, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0034053343174592437 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.07655031903659895, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002949142358861536 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.023735273095661872, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019962706229605222 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.018705153963296396, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001528405714423096 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.017184076709079645, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013681327172434656 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.08290890093834904, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0035342470286738145 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.07811395555950774, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003073914358976181 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.06625753359506499, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0025972013724176652 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.08594629250205217, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003694845579474083 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.07970107699146455, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0031266107330673583 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.06806949198182898, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002661868418015175 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0ade909957d790f5ad77b5e657170b29d09ac7e7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_no-prompt-needed_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { 
+ "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.7515007260497569, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0607371553698586 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.11041227742052115, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004319670141318303 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.09875052543479973, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00361264598137581 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.08641128265874654, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0031216881179551023 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.026424633415174224, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0020835974054162335 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.02144585619916054, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0016515571686378792 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.019362442756114775, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014460076587289445 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.0955827666174569, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0038147197500493523 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.08729998313244142, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003273426860969895 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.07523875031649757, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0027701164872551303 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.09853735471232261, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003943435886305644 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.08851997873549314, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003306265435627653 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.0769072306358556, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002823987889826904 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..115bcf0683509690a92c0871f83505ce9688a011 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d121b8acca9d28d9455d9ef22fb1b53c2cd1c490 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5032644178454843, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665575530760367 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5032644178454843, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665575530760367 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dfe5749767485b09c519a362ba57b69676ac626c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5108813928182807, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663061261117746 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5108813928182807, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 
0.011663061261117746 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..84e2ccd48908074e59ca4dc7a2f291b6584cd3e6 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5032644178454843, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166557553076037 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5032644178454843, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166557553076037 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d1591f166b47b8b666756f02df226191a82010c6 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5092491838955386, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663828032649181 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5092491838955386, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663828032649181 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0ebc6fdf7bdf84c4428c1f7f6cf8b5260be5eb0d --- 
/dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_pick_correct_choice_index_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.499455930359086, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665817258899177 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.499455930359086, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665817258899177 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..563d5c476eb7fe2b9e4c13d480297041ae2ee166 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.573993471164309, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01153737544851944 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.55930359085963, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01158347809065713 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dfd7e7e7a32a7c83692a506539c47831dcb35e8b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5658324265505985, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011564264866016057 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5642002176278563, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011569259195486625 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7778953416d1bf492b9d61d983c35095d23d9ff7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5745375408052231, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011535468840824526 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5712731229597389, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011546694435712187 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bed43ce9ae88f3f186b7214ba11e4ade5d062908 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5560391730141458, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011592322161277832 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.558759521218716, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011584987344513572 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b21533aebbdf881cbc35e8d05741b3e4059b55c6 --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5533188248095756, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011599305042745072 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5549510337323177, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01159515750977576 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6aa15b45fcda410890f65ddbcaac90aa3abdc094 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5505984766050055, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011605936624156075 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5505984766050055, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011605936624156075 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7b74afcf865a99c63e43af124a2cff8cceb4a373 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.573, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01564978964446222 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.5, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015819299929208316 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d17cc1dcd32ac8d2a8c314ff326c9d4538e5ce86 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.651, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015080663991563097 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.624, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015325105508898134 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..caa365fc281a024017d0b848a09cd0e5c5de873d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.667, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.651, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.0150806639915631 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bd5d8316735902a74b09f298f454969e51a7d2e5 --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.666, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014922019523732967 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.664, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014944140233795023 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fa8f992f4052c93992e4c4c6bfe7edd01054e8ea --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.678, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014782913600996669 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.669, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01488827258820394 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..418fc40ce558b3de797b2e68c9a94c17a87cfd50 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.674, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01483050720454104 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.664, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014944140233795021 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..08dc922df5f244dc671273408253dfc6db25206d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.848, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011358918303475291 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.768, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.013354937452281572 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bad36d24d9bf23b18029133b05dcfc8fe7396147 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.895, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.009698921026024942 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.889, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009938701010583726 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bb13f341d05bc0c8975fc10b3efe8eacb8e14066 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.905, + "dataset_path": "sciq", + "dataset_name": null, + 
"subset": null, + "acc_stderr": 0.009276910103103306 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.904, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.00932045443478322 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f5675770afbe2634cc489ad06d7822af91e947bf --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.908, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.009144376393151108 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.912, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008963053962592076 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9dd749e31a9301eda1d30d69ace62343e2e6f074 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.912, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.00896305396259208 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.906, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.00923305200078773 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..1b318b2615c8e136acbdc42c26a687e8586e39ff --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Direct-Question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.914, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008870325962594766 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.912, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008963053962592076 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d1e6aaaa88bc6e3d06a8caf718160bd6396409b2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.346, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015050266127564445 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.35, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015090650341444231 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fc6efd0ecf06af39d4d08806487745f14bdecb9b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.398, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015486634102858918 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.409, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015555094373257942 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2caa82810a5a7b02fe887cabb2ffce1323df7e1c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.374, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015308767369006366 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.384, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01538768276189707 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f4e761efb6b5585ff77d43296450721636010099 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.405, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015531136990453045 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.416, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015594460144140607 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ac131b8ba398ade71bab83f3ea5d2083d191597c --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.395, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015466551464829347 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.4, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015499685165842594 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c18b44099cedd453bf3560125f0b11698295c05e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.4, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015499685165842592 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.412, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015572363292015097 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e2beb56e7944fafce82f346f84e210405252208d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.401, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015506109745498318 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.388, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01541731797991108 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d0d59ce7607314a393301556f0ecb28a7df3f442 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.357, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015158521721486773 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.372, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015292149942040577 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0bbd803556f3b647fd619cacbaafc22c832420cf --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.377, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015333170125779847 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.396, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01547331326585941 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cc525ab585f76acfd85b0bfbf058436d4c8d7e56 --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.4, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015499685165842592 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.412, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015572363292015095 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6d596cb7a683efaba02b238526b0398af426698e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.405, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015531136990453045 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.424, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015635487471405186 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..931fb7c26d20d0983c18dba5fa6fc0fc2df12353 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.399, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015493193313162908 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.417, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015599819048769618 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e55bff2fbcef1c224558006fc05ef8cab4b03d89 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.366, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015240612726405754 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.375, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015316971293620996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0faf1b0e2219f7e7b259ca2f92dcfc9d2d97f13e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.385, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01539519444541081 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.397, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015480007449307989 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ae5ee5d51ce3aff4d492d19f98bd836d135653fc --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.373, + "dataset_path": "sciq", + "dataset_name": null, + "subset": 
null, + "acc_stderr": 0.015300493622922809 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.397, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015480007449307982 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5c3cad7bf5eebc2997d9429f3517cbb5dadcb889 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.411, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015566673418599275 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.413, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01557798682993653 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..63f587c131fa611ef65adc59e241fc5129c57b22 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.417, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015599819048769618 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.432, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015672320237336206 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..4f60ef545c79a769f6e4629a5805829b3fdf9a10 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_sciq_Multiple-Choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.407, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015543249100255545 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.413, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015577986829936531 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..28ef9a397729ad779292e4101017382afdfdb47a --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.5152324959914484, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011557065368348288 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.5344735435595938, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011534917341355127 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..326337a4694578a72147fb8917721eb3b7635c95 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.501336183858899, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01156239096465875 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.5093532870122929, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011560409019420374 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4c11ef4402ad46bc3c01b05820d091ce9d937e58 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4730090860502405, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011545573278697239 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4740780331373597, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011546883081384893 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2decaac07237f91ee27598bad953e5efaf36490f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.46766435061464456, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011538227692217273 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.46392303580972744, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01153229486915312 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..46c255249327346c3c25fad80cd8a93d04b69094 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4596472474612507, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011524715486240652 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4649919828968466, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011534056494505864 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..05af27593a926ab6a297da59b797cf5c594cf003 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.46018172100481025, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011525709570367509 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4623196151790486, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011529552555884573 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dac27c6358aa5c68bde979e5bffb179f5db819b6 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.5163014430785676, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011556285484521566 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5291288081239979, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011542794417345719 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8c9239f7d49ecf4858880334a9d6015e51f2585f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.48850881881346875, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011559378273599121 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5093532870122929, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011560409019420372 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0e7573f9eb6a2cfd45458ac49f9e641f3ca3eb26 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4692677712453234, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011540570846495546 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.46873329770176375, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01153980308563773 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..1dbadeccf6ec218a530c8b623203271547488e64 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4564404061998931, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011518470676766509 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4537680384820951, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01151289919986303 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..10fb42ed0861c4c0b1940d2add362543a29546df --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4575093532870123, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011520605695184078 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.45323356493853556, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011511744771088355 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5f91e34ff2b166f741c264c447ddde89172ffb27 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.46125066809192944, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011527657726586461 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4575093532870123, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01152060569518408 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..518831f64f5f1d125ee19db9c5337a55c4fc745e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..723b3b831431fc85c60831effcef86ed1f2a2bba --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b6ad009fbe8680a1716c2669dcb330c04cf526fc --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..281351279c9b6b4105b90589ae40c55b877fb9cd 
--- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..745b4746341ddca746579518854dffa143f38b40 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9a12305462884b2dad98f893a3a196970d1bc7ee --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..30b67442804f57768a8796d7dcd9705bce0b4fb7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.5173703901656868, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011555452669106634 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.5243185462319615, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011548748301487312 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bff08a6916d1fad1118f2cae3be3aac04c4e48b5 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.49545697487974344, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011561954965856516 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.5141635489043292, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011557792331301676 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..76a87e0c9cdc9abd176db129ba69d9f9d74ad55c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.46980224478888294, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011541325320336615 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.47835382148583644, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011551591851683338 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..0713272654850c8ee930678314b6e61496cdc6a6 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4778193479422769, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011551049647290314 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.46766435061464456, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011538227692217271 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a527afaf9c7361f903e57fc6b13544998e7c89cb --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.45537145911277394, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011516282203726655 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.46125066809192944, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011527657726586463 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f55d35622647154684eb46740f96c42d438e5432 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.46392303580972744, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01153229486915312 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4575093532870123, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011520605695184075 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a99370d1ce1c022a6253f000ba3cfde43914d6ea --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.5291288081239979, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011542794417345719 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5403527525387494, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011524715486240658 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a2efed46a66d9c0c68271cf6c96dc2e83545d71f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4911811865312667, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01156063365695297 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.49599144842330306, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562060664045738 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_2.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b46758060c64faa0391772c2aae168e2d1bc3f5e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4633885622661678, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011531394084549621 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4665954035275254, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011536599118298163 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..442b029bd7f77ebca9383657d02d90e1e9a421e3 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4660609299839658, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011535764881641411 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.46392303580972744, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011532294869153118 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4c734d581652d1ebc33a8dc72727a53adf6bf7c7 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4649919828968466, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011534056494505866 + }, + { + "task_name": 
"story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.46178514163548906, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011528611805439891 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..21340f5802c25a98edac9b6ebe4773355ff4c058 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4665954035275254, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011536599118298171 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4649919828968466, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011534056494505864 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f7b0c14e5cc4345985fc7b90ba8eff428a10ea5f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end 
of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5b179a31437e46e4fbc94eb9037c5de843804c7d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331327 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b863a84f50e94352be07404fbe350f3aa40d3698 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03003973059219781 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03003973059219781 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a5e329f77cf5bc160301fc1cc8ca332f8ce3265a --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03006330041190266 + } + ], + 
"config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d4acc202b029a3f947c257d7cb9325f73ef6144a --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..17021c4df336aa2fe97d2ea672ca5901ac14344b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.4620938628158845, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030009848912529117 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c4ac5d68f7b1e53a9f462d68aa91d1d8950babb0 --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030009848912529113 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c40cded477fbdd76cd1b6cb0bd3417ac641454a0 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fdbd7f70cc6999a47acf1d0cd6be9bf7bdaa142b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bc44141ef3b382db5830480072690eb06dc19d18 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b2f16f6da7a0df17e64a8ce95f30940f5a3d44ee --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.4548736462093863, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029973636495415252 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.4729241877256318, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3d16b3137a53bec1f0cebffbc7d6a197f116a90d --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.44404332129963897, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.02990739633379599 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.4620938628158845, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030009848912529117 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d2288198894f06c950c4710e5017fa833af5a746 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030039730592197816 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6d0112455471d002ee5db560fa5226eadcb07e07 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ddb43ad03e1afe1143f6d95e1dd49a969de03377 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.47653429602888087, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fc45ba96ab688e2be60c836d806c5d9d1baadffe --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.0300727231673172 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..92a61dd8b0ceef5824fc416af3062d2b1d18ce43 --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.44404332129963897, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029907396333795994 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.4657039711191336, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03002557981936642 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4fe88b10b29195b73bc1f2cf3c50f8072a75a468 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.4404332129963899, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029882123363118726 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.44765342960288806, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.02993107036293953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a93717ba096d2229471356b354218687fbd5fc80 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..63fa14b763a9a592a21171f5ee06c32d0ce46cca --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..475c79b521be7958b35c403e17a3c21c325df319 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5723783beb409cb1de80aeb3fcc222936cc5ece1 --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e656337561d9735a59ad254d840ac4745f49fd44 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.4657039711191336, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03002557981936642 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331327 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f995ec97514eeeab4efe4e8633c0d253580d7712 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_guaranteed-true_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.4223826714801444, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029731622646495887 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.44404332129963897, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029907396333795997 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..663d9fac7868a0ac3c7f1276932f60b87c868869 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030025579819366426 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..716cf4728d7cec6da999b12628e37261b5ef4615 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e45acba2936c10735388e735630fb47ab5e998d4 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_2.json 
@@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030039730592197812 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b7c9ec32f67552852c6fccbe6257528890403f0b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03006330041190266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c04410c144d92b12073ee2d27070f415c7f5233a --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..93501d0b9cbed7a3877d42a51923b26d890f4c5b --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_superglue_rte_should-assume_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.4729241877256318, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cd38b4c6fd42da156e2fd5cf6173092ad936ac7e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.516179952644041, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045126130978601 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b36dade00858aab97c5ababcdb79f5b38c151b55 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5295974743488555, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014027843827840083 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 
0.5248618784530387, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01403510288362775 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a62390768848888c1c6712730713bbccaca01af1 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5224940805051302, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014038257824059885 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5153906866614049, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014045826789783665 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b55b48fa51f412ccf17da8b512fa4e73541dc07e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5217048145224941, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01403923921648463 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405090552122858 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..e9c2a4d1f01e78231c4d0c446b277eaeee5d82bd --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5146014206787688, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014046492383275835 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..588b6d41eb1f8182ab2a251ecd136a2f7854bec4 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_Replace_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01404827882040562 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..25b29f8d2d62ea2a325ec64d38bf72080b10d0e3 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076896 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076896 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2ff9b42725b81b218940c7ea6f28d906daca5386 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824189 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225629 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2a892174109f6c7bc2e6461b92d04d652c2a9b6d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049294536290396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ddb96a59f1ab252a502b16c8dabb41b00d2481e3 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_3.json @@ 
-0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5130228887134964, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014047718393997663 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5130228887134964, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b985bfc222b109a21d40fae28065f41451307626 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049749833367596 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050905521228573 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4c4eea187624b001065b516ea19100646d7721c4 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_True-or-False_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5185477505919495, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014042813708888378 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5240726124704025, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014036189665395132 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0a49a51a48a9046e192e69d5d2f92bfafdddcd64 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.49329123914759276, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051220692330349 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.47987371744277824, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014041096664344324 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c2d0d90bce841e6a8b072fd6161036a328d32065 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616441 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225629 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d6f9800cdc41eec24c84e40ccbde343eaf6eb6f9 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 
0.4877663772691397, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014048278820405612 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.49013417521704816, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367585 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..767bc03c2153eb7f160d4d8425768f19bbcda04c --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140505553228242 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052131146915845 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2a34b30d2e75490d85743305724dc317af10370f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.505130228887135, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051745961790516 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616441 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
+ "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d88a0aec034b6bdb38a25bcfd2e6aa4f6563c7e8 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052481306049512 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049512 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f47e186988ed507f297215120663fbcfb33dec19 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5209155485398579, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01404018549421295 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049512 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f36820ce6a34b3c7df3851c74aa28dda557b1a0d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5130228887134964, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 
0.014047718393997663 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..71f361be039428682ca8a92837119cbc34582318 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824194 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.0140492945362904 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..708197309c0b9a740a367dcab0f6e22edde22aa2 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.516179952644041, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045126130978596 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5130228887134964, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047718393997663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_4.json 
b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..12b6106a8a0d12736a74581ebc6300e78ac8538e --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.516179952644041, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045126130978594 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5177584846093133, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014043619596174966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9bbd879c535a6920f960a0f85f7c252d0e27d00d --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_stand-for_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5209155485398579, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014040185494212945 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052131146915867 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_0.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fc8017d706a51c4a6d355b8ebe7a267e4433b664 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5090765588003157, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405017009449771 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076906 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_1.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1887fb4500aeb006c98ec1e4111dfb0e9e861e00 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.4861878453038674, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014047122916440419 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.4696132596685083, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014026510839428744 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_2.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..30001299e09d68a2f0f7f3b92a685abb25d12d45 --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.48697711128650356, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014047718393997674 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.48855564325177586, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048804199859329 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_3.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5907d889c023eab8e8e6bf7f38841c951c54af85 --- /dev/null +++ 
b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.505130228887135, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051745961790516 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_4.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7c9d77d521a932c5973a49b1c0f6c9421a29183a --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.505130228887135, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051745961790516 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050555322824194 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_5.json b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..73a3c03919807aca3e0deb0eaea6a15e8c64509f --- /dev/null +++ b/4b284b21boscar/eval/slim.4b284b21boscar_winogrande_underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616436 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.49171270718232046, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050555322824189 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/merged.csv b/4b284b21boscar/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..3f74844d90af4c3342c7cbc6081c380409e06577 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0834337726594754 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0834337726594754 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.22091389030064756 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.22091389030064756 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.24745510003725912 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.24745510003725912 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2531409526666904 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2531409526666904 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2547260269260162 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2547260269260162 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.25642542561601567 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.25642542561601567 +e2e_nlg_cleaned,5,average,multiple,0.21934919470101738 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04697902966116714 +gem_xsum,0,median,rouge2_fmeasure,0.04697902966116714 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04454429056503735 +gem_xsum,1,median,rouge2_fmeasure,0.04454429056503735 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05354419366516433 +gem_xsum,2,median,rouge2_fmeasure,0.05354419366516433 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05453694888387553 +gem_xsum,3,median,rouge2_fmeasure,0.05453694888387553 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.013296321991506897 +gem_xsum,4,median,rouge2_fmeasure,0.013296321991506897 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00018583265458717857 +gem_xsum,5,median,rouge2_fmeasure,0.00018583265458717857 +gem_xsum,5,average,multiple,0.03551443623688974 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04529204402030828 +web_nlg_en,0,median,rouge2_fmeasure,0.04529204402030828 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08499563989738917 +web_nlg_en,1,median,rouge2_fmeasure,0.08499563989738917 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.10954670655177427 +web_nlg_en,2,median,rouge2_fmeasure,0.10954670655177427 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.11757388147146428 +web_nlg_en,3,median,rouge2_fmeasure,0.11757388147146428 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.12321787830184827 +web_nlg_en,4,median,rouge2_fmeasure,0.12321787830184827 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.13531236031865498 +web_nlg_en,5,median,rouge2_fmeasure,0.13531236031865498 +web_nlg_en,5,average,multiple,0.10265641842690654 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.033116246346807675 +wiki_lingua_en,0,median,rouge2_fmeasure,0.033116246346807675 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04555610212149096 +wiki_lingua_en,1,median,rouge2_fmeasure,0.04555610212149096 
+wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06387789794595161 +wiki_lingua_en,2,median,rouge2_fmeasure,0.06387789794595161 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.054477366161976716 +wiki_lingua_en,3,median,rouge2_fmeasure,0.054477366161976716 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01686088972765567 +wiki_lingua_en,4,median,rouge2_fmeasure,0.01686088972765567 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0029391304431817997 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0029391304431817997 +wiki_lingua_en,5,average,multiple,0.0361379387911774 diff --git a/4b284b21boscar/evaluation/generation/merged.json b/4b284b21boscar/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..c1822d16c1cedddbe350fdbcf5a28ea680ef3248 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3119407628991087, "bleu_stderr": 0.02839548709754242, "rouge1_fmeasure": 0.0990806449013871, "rouge1_fmeasure_stderr": 0.0021415843483912466, "rouge1_precision": 0.07438946576288918, "rouge1_precision_stderr": 0.0025513647065143596, "rouge1_recall": 0.2632468762241862, "rouge1_recall_stderr": 0.004868249221352049, "rouge2_fmeasure": 0.04529204402030828, "rouge2_fmeasure_stderr": 0.001254298305442429, "rouge2_precision": 0.03312917849091313, "rouge2_precision_stderr": 0.0014174063689487894, "rouge2_recall": 0.12235027642168965, "rouge2_recall_stderr": 0.0031144296943736486, "rougeL_fmeasure": 0.09534482558563832, "rougeL_fmeasure_stderr": 0.0019930933840825046, "rougeL_precision": 0.07117230529834999, "rougeL_precision_stderr": 0.00236804160887153, "rougeL_recall": 0.25570352079335334, "rougeL_recall_stderr": 0.004721463101193061, "rougeLsum_fmeasure": 0.09431820077676166, "rougeLsum_fmeasure_stderr": 0.0020087380246042903, "rougeLsum_precision": 0.07060394534578683, "rougeLsum_precision_stderr": 0.0023788303493259178, "rougeLsum_recall": 0.25086964131360756, "rougeLsum_recall_stderr": 0.00462453643406655}}, "1": {"PALM_prompt": {"bleu": 0.5375007952609668, "bleu_stderr": 0.027971198631201743, "rouge1_fmeasure": 0.16868686014694956, "rouge1_fmeasure_stderr": 0.0037497452245084024, "rouge1_precision": 0.1520773825012676, "rouge1_precision_stderr": 0.004748290705934928, "rouge1_recall": 0.3227263264666382, "rouge1_recall_stderr": 0.004944101731638509, "rouge2_fmeasure": 0.08499563989738917, "rouge2_fmeasure_stderr": 0.002483035814872989, "rouge2_precision": 0.07832284618044694, "rouge2_precision_stderr": 0.0031613171494569007, "rouge2_recall": 0.1641704139450893, "rouge2_recall_stderr": 0.003490737203218088, "rougeL_fmeasure": 0.1534335691189539, "rougeL_fmeasure_stderr": 0.0032520482964081736, "rougeL_precision": 0.13745282575305529, "rougeL_precision_stderr": 0.004251980584200159, "rougeL_recall": 0.3019875028781424, "rougeL_recall_stderr": 0.004563649108443115, "rougeLsum_fmeasure": 0.15571056260264077, "rougeLsum_fmeasure_stderr": 0.0033278332133086664, "rougeLsum_precision": 0.13987460558088444, "rougeLsum_precision_stderr": 0.00433835517591582, "rougeLsum_recall": 0.3044600652888652, "rougeLsum_recall_stderr": 0.004600132970650535}}, "2": {"PALM_prompt": {"bleu": 0.7869498474646944, "bleu_stderr": 0.04640396849840582, "rouge1_fmeasure": 0.2084575980044784, "rouge1_fmeasure_stderr": 0.004415820801430582, "rouge1_precision": 0.19796396527381735, "rouge1_precision_stderr": 0.005701278438899864, "rouge1_recall": 0.3706483303216963, "rouge1_recall_stderr": 0.0050173611899764405, "rouge2_fmeasure": 
0.10954670655177427, "rouge2_fmeasure_stderr": 0.0031308589770994762, "rouge2_precision": 0.10467370587595308, "rouge2_precision_stderr": 0.003819065495086435, "rouge2_recall": 0.19711908309071768, "rouge2_recall_stderr": 0.0038874252766392605, "rougeL_fmeasure": 0.18766755781965408, "rougeL_fmeasure_stderr": 0.0038435725948238425, "rougeL_precision": 0.17592108046935254, "rougeL_precision_stderr": 0.005006735502669654, "rougeL_recall": 0.3446684722385714, "rougeL_recall_stderr": 0.004645081391462266, "rougeLsum_fmeasure": 0.19187903453194924, "rougeLsum_fmeasure_stderr": 0.003958689064769724, "rougeLsum_precision": 0.18060870457183115, "rougeLsum_precision_stderr": 0.005158605822048483, "rougeLsum_recall": 0.3494009114809856, "rougeLsum_recall_stderr": 0.004713912149914969}}, "3": {"PALM_prompt": {"bleu": 0.8781645295094662, "bleu_stderr": 0.023295523644902574, "rouge1_fmeasure": 0.22142348005369172, "rouge1_fmeasure_stderr": 0.004576657734814378, "rouge1_precision": 0.20253583048997179, "rouge1_precision_stderr": 0.005667155356549766, "rouge1_recall": 0.39458836927395, "rouge1_recall_stderr": 0.00508328824891432, "rouge2_fmeasure": 0.11757388147146428, "rouge2_fmeasure_stderr": 0.003185839232819997, "rouge2_precision": 0.11025140845494157, "rouge2_precision_stderr": 0.003876483056835859, "rouge2_recall": 0.21020652499163808, "rouge2_recall_stderr": 0.003888206554440605, "rougeL_fmeasure": 0.19753768298479737, "rougeL_fmeasure_stderr": 0.003907531687545351, "rougeL_precision": 0.17821674790607994, "rougeL_precision_stderr": 0.004906399590419944, "rougeL_recall": 0.3645388282549846, "rougeL_recall_stderr": 0.004646105275227094, "rougeLsum_fmeasure": 0.20241133896176328, "rougeLsum_fmeasure_stderr": 0.004042906289046454, "rougeLsum_precision": 0.18364695968481887, "rougeLsum_precision_stderr": 0.005076299243812921, "rougeLsum_recall": 0.3699350432753926, "rougeLsum_recall_stderr": 0.004721581387764664}}, "4": {"PALM_prompt": {"bleu": 0.9900707197163517, "bleu_stderr": 0.031011845880251587, "rouge1_fmeasure": 0.231391578826814, "rouge1_fmeasure_stderr": 0.004602731955828627, "rouge1_precision": 0.21624013657641925, "rouge1_precision_stderr": 0.0057958787897655845, "rouge1_recall": 0.4006689051971167, "rouge1_recall_stderr": 0.005025377089358478, "rouge2_fmeasure": 0.12321787830184827, "rouge2_fmeasure_stderr": 0.0031814702327693906, "rouge2_precision": 0.11741133357899738, "rouge2_precision_stderr": 0.0038707531452077852, "rouge2_recall": 0.21636504595775147, "rouge2_recall_stderr": 0.003993049382207552, "rougeL_fmeasure": 0.20337113659463418, "rougeL_fmeasure_stderr": 0.003830528444749543, "rougeL_precision": 0.1866671922061064, "rougeL_precision_stderr": 0.004886171018823189, "rougeL_recall": 0.3671506160865042, "rougeL_recall_stderr": 0.004552611209109354, "rougeLsum_fmeasure": 0.2102687170624152, "rougeLsum_fmeasure_stderr": 0.0040377255274080814, "rougeLsum_precision": 0.19479702587195674, "rougeLsum_precision_stderr": 0.00516733338200102, "rougeLsum_recall": 0.3738824304063063, "rougeLsum_recall_stderr": 0.004627178059919308}}, "5": {"PALM_prompt": {"bleu": 1.1661469259713872, "bleu_stderr": 0.05442011585382653, "rouge1_fmeasure": 0.2462703929154201, "rouge1_fmeasure_stderr": 0.004784081954132341, "rouge1_precision": 0.23542620246457463, "rouge1_precision_stderr": 0.006120612746647669, "rouge1_recall": 0.4126382803733479, "rouge1_recall_stderr": 0.004969059062845303, "rouge2_fmeasure": 0.13531236031865498, "rouge2_fmeasure_stderr": 0.003419155242253958, "rouge2_precision": 
0.13414542688077327, "rouge2_precision_stderr": 0.004317950948756051, "rouge2_recall": 0.22644337464507955, "rouge2_recall_stderr": 0.00400180902077526, "rougeL_fmeasure": 0.21699579683860706, "rougeL_fmeasure_stderr": 0.004030577049307887, "rougeL_precision": 0.20531156621874325, "rougeL_precision_stderr": 0.005277463604539717, "rougeL_recall": 0.3773305254767969, "rougeL_recall_stderr": 0.004505345415815039, "rougeLsum_fmeasure": 0.22427477226716616, "rougeLsum_fmeasure_stderr": 0.004231158968354307, "rougeLsum_precision": 0.21360232440350232, "rougeLsum_precision_stderr": 0.005527478664092985, "rougeLsum_recall": 0.38494058675880927, "rougeLsum_recall_stderr": 0.004590859610063705}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.744610721855112, "bleu_stderr": 0.16526552066121028, "rouge1_fmeasure": 0.13282629414914784, "rouge1_fmeasure_stderr": 0.0025194987202168846, "rouge1_precision": 0.13048152009975786, "rouge1_precision_stderr": 0.0029945327745025297, "rouge1_recall": 0.17865577403787464, "rouge1_recall_stderr": 0.0033881835064667153, "rouge2_fmeasure": 0.033116246346807675, "rouge2_fmeasure_stderr": 0.0010039942081565677, "rouge2_precision": 0.03018125254771904, "rouge2_precision_stderr": 0.0009701597241428515, "rouge2_recall": 0.044665240253023694, "rouge2_recall_stderr": 0.001413086106707342, "rougeL_fmeasure": 0.10296911367521405, "rougeL_fmeasure_stderr": 0.0018747375341668269, "rougeL_precision": 0.10254296237637063, "rougeL_precision_stderr": 0.0024976516791238405, "rougeL_recall": 0.1412543271300389, "rougeL_recall_stderr": 0.0026741023650489657, "rougeLsum_fmeasure": 0.12355432651994205, "rougeLsum_fmeasure_stderr": 0.0023550817743274908, "rougeLsum_precision": 0.12215023147917263, "rougeLsum_precision_stderr": 0.0028728571931844558, "rougeLsum_recall": 0.1662465685102492, "rougeLsum_recall_stderr": 0.0031683211934688273}}, "1": {"tldr_en": {"bleu": 2.8487850954748324, "bleu_stderr": 0.07399043126258031, "rouge1_fmeasure": 0.18959676949532062, "rouge1_fmeasure_stderr": 0.0022208786444120037, "rouge1_precision": 0.24565174971014234, "rouge1_precision_stderr": 0.0037427166308021094, "rouge1_recall": 0.21184490359364247, "rouge1_recall_stderr": 0.0027978242331130445, "rouge2_fmeasure": 0.04555610212149096, "rouge2_fmeasure_stderr": 0.0012842639383539272, "rouge2_precision": 0.064292697870659, "rouge2_precision_stderr": 0.00223313142118904, "rouge2_recall": 0.05044989353111514, "rouge2_recall_stderr": 0.00149356139860852, "rougeL_fmeasure": 0.14459077408940324, "rougeL_fmeasure_stderr": 0.0016809419269475205, "rougeL_precision": 0.19090785019208276, "rougeL_precision_stderr": 0.0031096615547551243, "rougeL_recall": 0.16233189417978816, "rougeL_recall_stderr": 0.0021666821757260154, "rougeLsum_fmeasure": 0.1779721097148494, "rougeLsum_fmeasure_stderr": 0.002089075825231665, "rougeLsum_precision": 0.23112890188861118, "rougeLsum_precision_stderr": 0.003571687360942092, "rougeLsum_recall": 0.19926382429471512, "rougeLsum_recall_stderr": 0.0026476455852546383}}, "2": {"tldr_en": {"bleu": 3.7186948773008828, "bleu_stderr": 0.05546113136172655, "rouge1_fmeasure": 0.2255393163759458, "rouge1_fmeasure_stderr": 0.002203135213246267, "rouge1_precision": 0.3177459530757474, "rouge1_precision_stderr": 0.00400211201809437, "rouge1_recall": 0.23523949924863127, "rouge1_recall_stderr": 0.0027923501651691642, "rouge2_fmeasure": 0.06387789794595161, "rouge2_fmeasure_stderr": 0.0013813734339407, "rouge2_precision": 0.09785647917944716, "rouge2_precision_stderr": 0.002624157152992091, 
"rouge2_recall": 0.06533504155625053, "rouge2_recall_stderr": 0.00156036864094978, "rougeL_fmeasure": 0.17360298467727694, "rougeL_fmeasure_stderr": 0.0017280099368334454, "rougeL_precision": 0.24936441373065246, "rougeL_precision_stderr": 0.0034018286291394332, "rougeL_recall": 0.18082782092908864, "rougeL_recall_stderr": 0.002197669620317046, "rougeLsum_fmeasure": 0.21263985026820842, "rougeLsum_fmeasure_stderr": 0.0020905527432427407, "rougeLsum_precision": 0.30084337998110033, "rougeLsum_precision_stderr": 0.0038595850362637936, "rougeLsum_recall": 0.22147712923465016, "rougeLsum_recall_stderr": 0.0026291853666447025}}, "3": {"tldr_en": {"bleu": 2.687306446962641, "bleu_stderr": 0.0664578324203411, "rouge1_fmeasure": 0.19252198096891418, "rouge1_fmeasure_stderr": 0.0025594189650328643, "rouge1_precision": 0.2846746242566653, "rouge1_precision_stderr": 0.004319925738331281, "rouge1_recall": 0.1938876931146627, "rouge1_recall_stderr": 0.0030463601923653946, "rouge2_fmeasure": 0.054477366161976716, "rouge2_fmeasure_stderr": 0.0013706432857531726, "rouge2_precision": 0.086213919205008, "rouge2_precision_stderr": 0.0025532621303566427, "rouge2_recall": 0.05487592547354139, "rouge2_recall_stderr": 0.0015509508090563054, "rougeL_fmeasure": 0.1494133193527796, "rougeL_fmeasure_stderr": 0.0020035425127221443, "rougeL_precision": 0.22546828575157715, "rougeL_precision_stderr": 0.0036375567676432637, "rougeL_recall": 0.15015796002510307, "rougeL_recall_stderr": 0.00240340751959009, "rougeLsum_fmeasure": 0.18112869368220622, "rougeLsum_fmeasure_stderr": 0.002417009296022553, "rougeLsum_precision": 0.2694034648043568, "rougeLsum_precision_stderr": 0.004158098981958554, "rougeLsum_recall": 0.1821584435675664, "rougeLsum_recall_stderr": 0.0028693565561164385}}, "4": {"tldr_en": {"bleu": 0.04668959419048288, "bleu_stderr": 0.007153039873759097, "rouge1_fmeasure": 0.06084731695924829, "rouge1_fmeasure_stderr": 0.002176933000207993, "rouge1_precision": 0.0960156493415235, "rouge1_precision_stderr": 0.003616614563173548, "rouge1_recall": 0.06015169027512941, "rouge1_recall_stderr": 0.0023503642149532013, "rouge2_fmeasure": 0.01686088972765567, "rouge2_fmeasure_stderr": 0.0009404003204068355, "rouge2_precision": 0.02926487891196175, "rouge2_precision_stderr": 0.0018445908136531816, "rouge2_recall": 0.016390097756710017, "rouge2_recall_stderr": 0.0010106857547182186, "rougeL_fmeasure": 0.048152202763597045, "rougeL_fmeasure_stderr": 0.0017262084858342547, "rougeL_precision": 0.07801821469361046, "rougeL_precision_stderr": 0.00303848388489472, "rougeL_recall": 0.04729130821614451, "rougeL_recall_stderr": 0.0018488779124782556, "rougeLsum_fmeasure": 0.05696906006963692, "rougeLsum_fmeasure_stderr": 0.0020412224032817106, "rougeLsum_precision": 0.09073940139372572, "rougeLsum_precision_stderr": 0.0034537583670396933, "rougeLsum_recall": 0.05607109134810165, "rougeLsum_recall_stderr": 0.0021842218596914128}}, "5": {"tldr_en": {"bleu": 1.2603158162943564e-14, "bleu_stderr": 6.751733488101547e-13, "rouge1_fmeasure": 0.01017010786697453, "rouge1_fmeasure_stderr": 0.0009828813757946661, "rouge1_precision": 0.016572326495735878, "rouge1_precision_stderr": 0.0017108845418978154, "rouge1_recall": 0.009927306068576757, "rouge1_recall_stderr": 0.0010319824087958868, "rouge2_fmeasure": 0.0029391304431817997, "rouge2_fmeasure_stderr": 0.0004163042405148408, "rouge2_precision": 0.005362446572675429, "rouge2_precision_stderr": 0.0008727663926039747, "rouge2_recall": 0.002846016962133478, "rouge2_recall_stderr": 
0.0004240233213271509, "rougeL_fmeasure": 0.008341512107444696, "rougeL_fmeasure_stderr": 0.000825303231689821, "rougeL_precision": 0.014042125577094597, "rougeL_precision_stderr": 0.0015259273674767392, "rougeL_recall": 0.008096887154439805, "rougeL_recall_stderr": 0.000857985946484465, "rougeLsum_fmeasure": 0.009670630558683124, "rougeLsum_fmeasure_stderr": 0.0009355088060122658, "rougeLsum_precision": 0.015986982643131028, "rougeLsum_precision_stderr": 0.0016701967608221248, "rougeLsum_recall": 0.009428039452194052, "rougeLsum_recall_stderr": 0.0009838146973353556}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 8.153800130249518, "bleu_stderr": 0.19718858686762447, "rouge1_fmeasure": 0.16625763147318187, "rouge1_fmeasure_stderr": 0.0036077916351727156, "rouge1_precision": 0.14254055468502322, "rouge1_precision_stderr": 0.0032976710551722, "rouge1_recall": 0.21985285950008995, "rouge1_recall_stderr": 0.004764464623700324, "rouge2_fmeasure": 0.0834337726594754, "rouge2_fmeasure_stderr": 0.0019832478132023163, "rouge2_precision": 0.07117779808817919, "rouge2_precision_stderr": 0.0018071269269227326, "rouge2_recall": 0.11165748130728335, "rouge2_recall_stderr": 0.0026568526220331497, "rougeL_fmeasure": 0.13948285594182427, "rougeL_fmeasure_stderr": 0.003021076305392557, "rougeL_precision": 0.11919497082898337, "rougeL_precision_stderr": 0.0027419572678360447, "rougeL_recall": 0.18525665107433087, "rougeL_recall_stderr": 0.0040358655783698395, "rougeLsum_fmeasure": 0.1461396427981472, "rougeLsum_fmeasure_stderr": 0.003226960516681579, "rougeLsum_precision": 0.12480961931955924, "rougeLsum_precision_stderr": 0.002918263840706436, "rougeLsum_recall": 0.19394917883355808, "rougeLsum_recall_stderr": 0.004289318267212664}}, "1": {"generate_text_restaurant": {"bleu": 11.82816578683988, "bleu_stderr": 0.10613794908694005, "rouge1_fmeasure": 0.4695495704689584, "rouge1_fmeasure_stderr": 0.002364549723893871, "rouge1_precision": 0.5741208333690715, "rouge1_precision_stderr": 0.0032791825020937275, "rouge1_recall": 0.43616299285547877, "rouge1_recall_stderr": 0.0029918336223318646, "rouge2_fmeasure": 0.22091389030064756, "rouge2_fmeasure_stderr": 0.0020601818538796947, "rouge2_precision": 0.27417940276408975, "rouge2_precision_stderr": 0.002745882165587474, "rouge2_recall": 0.2046937372687625, "rouge2_recall_stderr": 0.00215463330838664, "rougeL_fmeasure": 0.3383298269843499, "rougeL_fmeasure_stderr": 0.002100531054867843, "rougeL_precision": 0.4168038088426164, "rougeL_precision_stderr": 0.0029886435691974275, "rougeL_recall": 0.3132001002805105, "rougeL_recall_stderr": 0.0024293669614656285, "rougeLsum_fmeasure": 0.38079591964268383, "rougeLsum_fmeasure_stderr": 0.00234065536986238, "rougeLsum_precision": 0.4669926334976919, "rougeLsum_precision_stderr": 0.0032045235446491534, "rougeLsum_recall": 0.3531706097054418, "rougeLsum_recall_stderr": 0.0027268177176508718}}, "2": {"generate_text_restaurant": {"bleu": 14.210994141328136, "bleu_stderr": 0.14395128449450548, "rouge1_fmeasure": 0.4992077922612147, "rouge1_fmeasure_stderr": 0.0023268998699713796, "rouge1_precision": 0.5957938406647506, "rouge1_precision_stderr": 0.0032189851910091535, "rouge1_recall": 0.4666858600777237, "rouge1_recall_stderr": 0.0029439118893739937, "rouge2_fmeasure": 0.24745510003725912, "rouge2_fmeasure_stderr": 0.0021577622824456, "rouge2_precision": 0.29925246097428815, "rouge2_precision_stderr": 0.0028135040853843815, "rouge2_recall": 0.2311260083068878, "rouge2_recall_stderr": 0.0022842743345053117, 
"rougeL_fmeasure": 0.36403535470109827, "rougeL_fmeasure_stderr": 0.002158659011059573, "rougeL_precision": 0.43644170204107635, "rougeL_precision_stderr": 0.0030000260097151686, "rougeL_recall": 0.339812800549829, "rougeL_recall_stderr": 0.0024913510362416656, "rougeLsum_fmeasure": 0.4134385994397868, "rougeLsum_fmeasure_stderr": 0.002387535627435029, "rougeLsum_precision": 0.4937561615312527, "rougeLsum_precision_stderr": 0.0031994326527389665, "rougeLsum_recall": 0.3863879597392378, "rougeLsum_recall_stderr": 0.002778922240841303}}, "3": {"generate_text_restaurant": {"bleu": 14.758419275441623, "bleu_stderr": 0.14846553640981838, "rouge1_fmeasure": 0.5048691947729075, "rouge1_fmeasure_stderr": 0.0022904626291377636, "rouge1_precision": 0.5969365877168988, "rouge1_precision_stderr": 0.003176934049191526, "rouge1_recall": 0.47321803649209376, "rouge1_recall_stderr": 0.0028949729129975907, "rouge2_fmeasure": 0.2531409526666904, "rouge2_fmeasure_stderr": 0.0021887540705134477, "rouge2_precision": 0.3021673329464843, "rouge2_precision_stderr": 0.002773447672105312, "rouge2_recall": 0.23749889827974513, "rouge2_recall_stderr": 0.0023473465141403524, "rougeL_fmeasure": 0.36861382037574736, "rougeL_fmeasure_stderr": 0.002208485032472364, "rougeL_precision": 0.43729156400302177, "rougeL_precision_stderr": 0.003007804981399626, "rougeL_recall": 0.34523784521876366, "rougeL_recall_stderr": 0.0025266731757362768, "rougeLsum_fmeasure": 0.4186873543720982, "rougeLsum_fmeasure_stderr": 0.0024264140364972063, "rougeLsum_precision": 0.4949310693592707, "rougeLsum_precision_stderr": 0.0031940660026111094, "rougeLsum_recall": 0.39247584122491685, "rougeLsum_recall_stderr": 0.0027974990919393808}}, "4": {"generate_text_restaurant": {"bleu": 14.9471441955308, "bleu_stderr": 0.20049087654668654, "rouge1_fmeasure": 0.5068439147529751, "rouge1_fmeasure_stderr": 0.0022702971756168133, "rouge1_precision": 0.5964162408645288, "rouge1_precision_stderr": 0.0031670174643345854, "rouge1_recall": 0.476041597124549, "rouge1_recall_stderr": 0.002870553977627495, "rouge2_fmeasure": 0.2547260269260162, "rouge2_fmeasure_stderr": 0.0021693766012080536, "rouge2_precision": 0.30318643893717323, "rouge2_precision_stderr": 0.0027688869090984037, "rouge2_recall": 0.2391458905396845, "rouge2_recall_stderr": 0.00231587546078518, "rougeL_fmeasure": 0.3700166988328032, "rougeL_fmeasure_stderr": 0.0022206494000581454, "rougeL_precision": 0.43667748453350236, "rougeL_precision_stderr": 0.0030068254256241543, "rougeL_recall": 0.3473159987716765, "rougeL_recall_stderr": 0.0025367602883456315, "rougeLsum_fmeasure": 0.4212470848260628, "rougeLsum_fmeasure_stderr": 0.0024487659358586895, "rougeLsum_precision": 0.495394213066299, "rougeLsum_precision_stderr": 0.0032160309119094407, "rougeLsum_recall": 0.3958929166908767, "rougeLsum_recall_stderr": 0.002827379825231143}}, "5": {"generate_text_restaurant": {"bleu": 15.12300988029553, "bleu_stderr": 0.27781770916239995, "rouge1_fmeasure": 0.5094338990091782, "rouge1_fmeasure_stderr": 0.0023180695212366305, "rouge1_precision": 0.5954584960080906, "rouge1_precision_stderr": 0.00325049366305589, "rouge1_recall": 0.47888879196863693, "rouge1_recall_stderr": 0.0028429629695053142, "rouge2_fmeasure": 0.25642542561601567, "rouge2_fmeasure_stderr": 0.0021909983236262304, "rouge2_precision": 0.30380630927932106, "rouge2_precision_stderr": 0.002835445529306852, "rouge2_recall": 0.2402751342075056, "rouge2_recall_stderr": 0.00226887033631254, "rougeL_fmeasure": 0.37228821193907236, 
"rougeL_fmeasure_stderr": 0.002208419488038826, "rougeL_precision": 0.4360299975558518, "rougeL_precision_stderr": 0.0029988845638560017, "rougeL_recall": 0.34982928906069083, "rougeL_recall_stderr": 0.002474832286550078, "rougeLsum_fmeasure": 0.4247238232753859, "rougeLsum_fmeasure_stderr": 0.0024321405233947197, "rougeLsum_precision": 0.4966033704013741, "rougeLsum_precision_stderr": 0.0032394543771446537, "rougeLsum_recall": 0.39901987621454466, "rougeLsum_recall_stderr": 0.0027430779669785923}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8671229080125915, "bleu_stderr": 0.08065653508825953, "rouge1_fmeasure": 0.1938850065877847, "rouge1_fmeasure_stderr": 0.00274725146396725, "rouge1_precision": 0.14108556952980922, "rouge1_precision_stderr": 0.0020956329882564906, "rouge1_recall": 0.32629804825161535, "rouge1_recall_stderr": 0.004756174570847187, "rouge2_fmeasure": 0.04697902966116714, "rouge2_fmeasure_stderr": 0.001582424189239984, "rouge2_precision": 0.03392541732989178, "rouge2_precision_stderr": 0.0011674749125079006, "rouge2_recall": 0.08137541100012088, "rouge2_recall_stderr": 0.00280912091771862, "rougeL_fmeasure": 0.14812739050025472, "rougeL_fmeasure_stderr": 0.002066618016210627, "rougeL_precision": 0.10765433273641108, "rougeL_precision_stderr": 0.0015745202745112603, "rougeL_recall": 0.25042455718394274, "rougeL_recall_stderr": 0.0036806317265347037, "rougeLsum_fmeasure": 0.15484724382764298, "rougeLsum_fmeasure_stderr": 0.0023502913687865295, "rougeLsum_precision": 0.11258249311328207, "rougeLsum_precision_stderr": 0.0017981489130128542, "rougeLsum_recall": 0.26182933742867265, "rougeLsum_recall_stderr": 0.004140212650666321}}, "1": {"article_DOC_summary": {"bleu": 1.7643751753804513, "bleu_stderr": 0.0800024640836125, "rouge1_fmeasure": 0.2072547434350325, "rouge1_fmeasure_stderr": 0.0029185590817686906, "rouge1_precision": 0.1818208364925118, "rouge1_precision_stderr": 0.0033811439409721453, "rouge1_recall": 0.2952342650035309, "rouge1_recall_stderr": 0.004323771519378331, "rouge2_fmeasure": 0.04454429056503735, "rouge2_fmeasure_stderr": 0.0017367902881785364, "rouge2_precision": 0.039255544280436615, "rouge2_precision_stderr": 0.0017970345435637744, "rouge2_recall": 0.06401578769657801, "rouge2_recall_stderr": 0.0024164697736714135, "rougeL_fmeasure": 0.15664138971417202, "rougeL_fmeasure_stderr": 0.002216089409249802, "rougeL_precision": 0.13745172437510383, "rougeL_precision_stderr": 0.0026008926362907463, "rougeL_recall": 0.22412058258654055, "rougeL_recall_stderr": 0.0033458865795404046, "rougeLsum_fmeasure": 0.1602204327991172, "rougeLsum_fmeasure_stderr": 0.0023319587866224892, "rougeLsum_precision": 0.1399352840204044, "rougeLsum_precision_stderr": 0.0026319234043525545, "rougeLsum_recall": 0.23050320162406268, "rougeLsum_recall_stderr": 0.0036405761255609054}}, "2": {"article_DOC_summary": {"bleu": 2.705819603984223, "bleu_stderr": 0.14137667865787865, "rouge1_fmeasure": 0.23644320153754822, "rouge1_fmeasure_stderr": 0.003260843172949545, "rouge1_precision": 0.2472505852052675, "rouge1_precision_stderr": 0.004009850648659755, "rouge1_recall": 0.2567441304949634, "rouge1_recall_stderr": 0.0038189444756239026, "rouge2_fmeasure": 0.05354419366516433, "rouge2_fmeasure_stderr": 0.002106078230291637, "rouge2_precision": 0.056866160489061936, "rouge2_precision_stderr": 0.002420240118413503, "rouge2_recall": 0.05783181540065835, "rouge2_recall_stderr": 0.0022927551175836464, "rougeL_fmeasure": 0.1770324612357477, "rougeL_fmeasure_stderr": 
0.002634383182795547, "rougeL_precision": 0.18570698243499414, "rougeL_precision_stderr": 0.003303016306234649, "rougeL_recall": 0.19264012368473676, "rougeL_recall_stderr": 0.0030469236090590076, "rougeLsum_fmeasure": 0.17924796506451007, "rougeLsum_fmeasure_stderr": 0.0026665867584947088, "rougeLsum_precision": 0.18746784009032694, "rougeLsum_precision_stderr": 0.003301909604388057, "rougeLsum_recall": 0.196059514574862, "rougeLsum_recall_stderr": 0.003205680714038162}}, "3": {"article_DOC_summary": {"bleu": 3.1173819151981363, "bleu_stderr": 0.23709102615992358, "rouge1_fmeasure": 0.23375535546317172, "rouge1_fmeasure_stderr": 0.0036448754340216603, "rouge1_precision": 0.25083050440215976, "rouge1_precision_stderr": 0.004340710144695701, "rouge1_recall": 0.24313830962431818, "rouge1_recall_stderr": 0.003987068682586417, "rouge2_fmeasure": 0.05453694888387553, "rouge2_fmeasure_stderr": 0.0022209047576266894, "rouge2_precision": 0.058302376165758424, "rouge2_precision_stderr": 0.002462019491771468, "rouge2_recall": 0.056822850129455496, "rouge2_recall_stderr": 0.002365353936727513, "rougeL_fmeasure": 0.17375448749498826, "rougeL_fmeasure_stderr": 0.002893029439925038, "rougeL_precision": 0.1865779794802579, "rougeL_precision_stderr": 0.003475251260232787, "rougeL_recall": 0.1815458312356556, "rougeL_recall_stderr": 0.0031953873575430005, "rougeLsum_fmeasure": 0.17450339640702453, "rougeLsum_fmeasure_stderr": 0.002909832621947404, "rougeLsum_precision": 0.1872796185903397, "rougeLsum_precision_stderr": 0.0034817440259912887, "rougeLsum_recall": 0.18253965500420624, "rougeLsum_recall_stderr": 0.003256036671696088}}, "4": {"article_DOC_summary": {"bleu": 0.15136892255345546, "bleu_stderr": 0.037709514860432364, "rouge1_fmeasure": 0.05594753186729229, "rouge1_fmeasure_stderr": 0.0033676619143302084, "rouge1_precision": 0.06668389338088138, "rouge1_precision_stderr": 0.004287951717923472, "rouge1_recall": 0.05770356648149944, "rouge1_recall_stderr": 0.0035956507787910136, "rouge2_fmeasure": 0.013296321991506897, "rouge2_fmeasure_stderr": 0.0013642061125793434, "rouge2_precision": 0.014963307619343467, "rouge2_precision_stderr": 0.0016114455998580692, "rouge2_recall": 0.013741449593915791, "rouge2_recall_stderr": 0.0014260391208111062, "rougeL_fmeasure": 0.04310400237676907, "rougeL_fmeasure_stderr": 0.0026639130697933863, "rougeL_precision": 0.052652024080707335, "rougeL_precision_stderr": 0.003605493741125939, "rougeL_recall": 0.044415851405158295, "rougeL_recall_stderr": 0.0028463083421698503, "rougeLsum_fmeasure": 0.04383519297886595, "rougeLsum_fmeasure_stderr": 0.002700540506868496, "rougeLsum_precision": 0.053307788896004335, "rougeLsum_precision_stderr": 0.003626638986661626, "rougeLsum_recall": 0.04533527259366541, "rougeLsum_recall_stderr": 0.0029075081420929395}}, "5": {"article_DOC_summary": {"bleu": 4.919560458856041e-39, "bleu_stderr": 3.5763240226586303e-34, "rouge1_fmeasure": 0.0024320284759234414, "rouge1_fmeasure_stderr": 0.0006736354616386897, "rouge1_precision": 0.00273620898511371, "rouge1_precision_stderr": 0.0007798516293741871, "rouge1_recall": 0.002264558087005939, "rouge1_recall_stderr": 0.0006160940919096034, "rouge2_fmeasure": 0.00018583265458717857, "rouge2_fmeasure_stderr": 9.342731096527664e-05, "rouge2_precision": 0.0002196833564758093, "rouge2_precision_stderr": 0.00011015503412814092, "rouge2_recall": 0.00016199733180865256, "rouge2_recall_stderr": 8.184314454744501e-05, "rougeL_fmeasure": 0.0018932762031196894, "rougeL_fmeasure_stderr": 0.0005068809117616748, 
"rougeL_precision": 0.002119565890814211, "rougeL_precision_stderr": 0.0005829990564075634, "rougeL_recall": 0.001776402682704595, "rougeL_recall_stderr": 0.00046929937422078794, "rougeLsum_fmeasure": 0.0019743733562370912, "rougeLsum_fmeasure_stderr": 0.000543487074589739, "rougeLsum_precision": 0.00220984304166733, "rougeLsum_precision_stderr": 0.0006273058335995559, "rougeLsum_recall": 0.0018523030709016646, "rougeLsum_recall_stderr": 0.0005016261609450595}}}} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b2b3dc60dc4c8acb7427a70c6f98a3fa157f8cf7 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.3119407628991087, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02839548709754242 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07438946576288918, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0025513647065143596 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.2632468762241862, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004868249221352049 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.0990806449013871, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021415843483912466 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03312917849091313, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014174063689487894 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.12235027642168965, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0031144296943736486 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04529204402030828, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001254298305442429 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07117230529834999, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00236804160887153 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.25570352079335334, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004721463101193061 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.09534482558563832, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019930933840825046 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07060394534578683, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 
0.0023788303493259178 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.25086964131360756, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00462453643406655 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.09431820077676166, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020087380246042903 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8ea12de9c5d31fad4ab5e2b83caa95bb54cb4fa2 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5375007952609668, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.027971198631201743 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.1520773825012676, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004748290705934928 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3227263264666382, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004944101731638509 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.16868686014694956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0037497452245084024 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.07832284618044694, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0031613171494569007 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1641704139450893, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003490737203218088 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.08499563989738917, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002483035814872989 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.13745282575305529, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004251980584200159 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3019875028781424, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": 
null, + "rougeL_recall_stderr": 0.004563649108443115 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1534335691189539, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0032520482964081736 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.13987460558088444, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00433835517591582 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3044600652888652, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004600132970650535 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.15571056260264077, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0033278332133086664 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..170f88a8a61391729e2728e3f6da663448f87836 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7869498474646944, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04640396849840582 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.19796396527381735, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005701278438899864 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3706483303216963, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0050173611899764405 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.2084575980044784, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004415820801430582 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.10467370587595308, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003819065495086435 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.19711908309071768, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0038874252766392605 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.10954670655177427, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0031308589770994762 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.17592108046935254, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005006735502669654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3446684722385714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004645081391462266 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.18766755781965408, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038435725948238425 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.18060870457183115, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005158605822048483 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3494009114809856, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004713912149914969 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.19187903453194924, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003958689064769724 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3d869466e5aee69b172a949faf832d41610d70f1 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.8781645295094662, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.023295523644902574 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.20253583048997179, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005667155356549766 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.39458836927395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00508328824891432 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.22142348005369172, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004576657734814378 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + 
"rouge2_precision": 0.11025140845494157, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003876483056835859 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.21020652499163808, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003888206554440605 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.11757388147146428, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003185839232819997 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.17821674790607994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004906399590419944 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3645388282549846, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004646105275227094 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.19753768298479737, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003907531687545351 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.18364695968481887, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005076299243812921 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3699350432753926, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004721581387764664 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.20241133896176328, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004042906289046454 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5b6425325f4625e4d254a6f53d5099a95e0af071 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.9900707197163517, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.031011845880251587 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.21624013657641925, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0057958787897655845 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4006689051971167, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005025377089358478 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.231391578826814, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004602731955828627 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.11741133357899738, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0038707531452077852 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.21636504595775147, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003993049382207552 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.12321787830184827, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0031814702327693906 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.1866671922061064, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004886171018823189 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3671506160865042, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004552611209109354 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.20337113659463418, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003830528444749543 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.19479702587195674, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00516733338200102 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3738824304063063, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004627178059919308 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.2102687170624152, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0040377255274080814 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4cf985263a7144ba482cb4d253771f4e0c63816c --- /dev/null +++ 
b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 1.1661469259713872, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05442011585382653 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.23542620246457463, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006120612746647669 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4126382803733479, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004969059062845303 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.2462703929154201, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004784081954132341 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.13414542688077327, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004317950948756051 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.22644337464507955, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00400180902077526 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.13531236031865498, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003419155242253958 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.20531156621874325, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005277463604539717 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3773305254767969, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004505345415815039 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.21699579683860706, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004030577049307887 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.21360232440350232, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005527478664092985 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.38494058675880927, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004590859610063705 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.22427477226716616, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004231158968354307 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3268f0a5ac4537c3966c1cf23c98c06053d4dfb2 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.13048152009975786, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0029945327745025297 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.17865577403787464, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033881835064667153 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.13282629414914784, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0025194987202168846 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.03018125254771904, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009701597241428515 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.044665240253023694, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001413086106707342 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.033116246346807675, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010039942081565677 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.10254296237637063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0024976516791238405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1412543271300389, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026741023650489657 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.10296911367521405, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018747375341668269 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.12215023147917263, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0028728571931844558 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.1662465685102492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0031683211934688273 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.12355432651994205, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_fmeasure_stderr": 0.0023550817743274908 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.744610721855112, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.16526552066121028 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bda712af9935fed5be9af37614650ffedbdf68d9 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.24565174971014234, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0037427166308021094 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.21184490359364247, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027978242331130445 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.18959676949532062, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022208786444120037 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.064292697870659, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00223313142118904 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05044989353111514, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00149356139860852 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04555610212149096, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012842639383539272 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.19090785019208276, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0031096615547551243 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.16233189417978816, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021666821757260154 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14459077408940324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016809419269475205 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.23112890188861118, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003571687360942092 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.19926382429471512, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026476455852546383 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1779721097148494, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002089075825231665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.8487850954748324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07399043126258031 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d308d69a77ba75828ed0feceb39689ed4d3096b2 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.3177459530757474, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00400211201809437 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.23523949924863127, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027923501651691642 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2255393163759458, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002203135213246267 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.09785647917944716, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002624157152992091 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06533504155625053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00156036864094978 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06387789794595161, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013813734339407 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.24936441373065246, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0034018286291394332 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.18082782092908864, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002197669620317046 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.17360298467727694, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017280099368334454 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.30084337998110033, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0038595850362637936 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.22147712923465016, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026291853666447025 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.21263985026820842, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020905527432427407 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.7186948773008828, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05546113136172655 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8bdae13bf80979b9a028b8a72c44676f4f76cb82 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2846746242566653, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004319925738331281 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.1938876931146627, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0030463601923653946 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.19252198096891418, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0025594189650328643 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.086213919205008, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0025532621303566427 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05487592547354139, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rouge2_recall_stderr": 0.0015509508090563054 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.054477366161976716, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013706432857531726 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.22546828575157715, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0036375567676432637 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.15015796002510307, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00240340751959009 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1494133193527796, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0020035425127221443 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2694034648043568, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004158098981958554 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.1821584435675664, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0028693565561164385 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18112869368220622, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002417009296022553 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.687306446962641, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0664578324203411 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1c7d30d7563da28c3f1d2fe88a50d4edd2d96502 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.0960156493415235, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003616614563173548 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.06015169027512941, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0023503642149532013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 
0.06084731695924829, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002176933000207993 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.02926487891196175, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018445908136531816 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.016390097756710017, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010106857547182186 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.01686088972765567, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009404003204068355 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07801821469361046, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00303848388489472 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.04729130821614451, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018488779124782556 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.048152202763597045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017262084858342547 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.09073940139372572, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0034537583670396933 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.05607109134810165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0021842218596914128 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05696906006963692, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020412224032817106 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.04668959419048288, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.007153039873759097 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dc25f5f2a0968f181d6aadf3b5d2c1a8060482e8 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + 
"results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.016572326495735878, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017108845418978154 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.009927306068576757, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0010319824087958868 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.01017010786697453, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009828813757946661 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.005362446572675429, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008727663926039747 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.002846016962133478, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0004240233213271509 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0029391304431817997, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004163042405148408 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.014042125577094597, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015259273674767392 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.008096887154439805, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.000857985946484465 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.008341512107444696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.000825303231689821 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.015986982643131028, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016701967608221248 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.009428039452194052, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0009838146973353556 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.009670630558683124, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009355088060122658 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.2603158162943564e-14, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 6.751733488101547e-13 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c40c89cb5cc68c4645e0acae120065c6f02fdc55 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 8.153800130249518, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.19718858686762447 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.14254055468502322, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032976710551722 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.21985285950008995, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004764464623700324 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.16625763147318187, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0036077916351727156 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.07117779808817919, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0018071269269227326 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.11165748130728335, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0026568526220331497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.0834337726594754, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0019832478132023163 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.11919497082898337, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0027419572678360447 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.18525665107433087, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0040358655783698395 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.13948285594182427, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.003021076305392557 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.12480961931955924, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002918263840706436 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_text_restaurant", + "rougeLsum_recall": 0.19394917883355808, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.004289318267212664 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.1461396427981472, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003226960516681579 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1c21f4a2702bd32dc0d2a9f47d743737bda483fd --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.82816578683988, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10613794908694005 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5741208333690715, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032791825020937275 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.43616299285547877, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029918336223318646 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4695495704689584, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002364549723893871 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.27417940276408975, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002745882165587474 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2046937372687625, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00215463330838664 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.22091389030064756, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020601818538796947 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4168038088426164, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029886435691974275 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3132001002805105, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024293669614656285 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3383298269843499, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002100531054867843 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4669926334976919, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032045235446491534 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3531706097054418, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027268177176508718 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.38079591964268383, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00234065536986238 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4ef3308d09c915646509a3d4a4841695d04d6bca --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.210994141328136, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14395128449450548 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5957938406647506, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032189851910091535 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4666858600777237, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029439118893739937 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4992077922612147, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023268998699713796 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.29925246097428815, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028135040853843815 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2311260083068878, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022842743345053117 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24745510003725912, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021577622824456 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43644170204107635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030000260097151686 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.339812800549829, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024913510362416656 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.36403535470109827, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002158659011059573 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4937561615312527, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031994326527389665 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3863879597392378, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002778922240841303 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4134385994397868, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002387535627435029 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7e8ef48cd287121e6293d1fa9eacd074274c65c7 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.758419275441623, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14846553640981838 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5969365877168988, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 
0.003176934049191526 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47321803649209376, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028949729129975907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5048691947729075, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022904626291377636 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.3021673329464843, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002773447672105312 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23749889827974513, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023473465141403524 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2531409526666904, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021887540705134477 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43729156400302177, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003007804981399626 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34523784521876366, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025266731757362768 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.36861382037574736, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002208485032472364 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4949310693592707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031940660026111094 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.39247584122491685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027974990919393808 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4186873543720982, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024264140364972063 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json 
b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5d5dbde08f110dc74fa41a5a0fc5afc8b7b73c22 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.9471441955308, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20049087654668654 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5964162408645288, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031670174643345854 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.476041597124549, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002870553977627495 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5068439147529751, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022702971756168133 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.30318643893717323, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027688869090984037 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2391458905396845, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00231587546078518 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2547260269260162, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021693766012080536 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43667748453350236, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030068254256241543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3473159987716765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025367602883456315 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3700166988328032, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0022206494000581454 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.495394213066299, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032160309119094407 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3958929166908767, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002827379825231143 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 
0.4212470848260628, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024487659358586895 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2fcd7c40bf2f43b3826670513d58d005f59791f6 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.12300988029553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.27781770916239995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5954584960080906, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00325049366305589 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47888879196863693, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028429629695053142 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5094338990091782, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023180695212366305 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.30380630927932106, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002835445529306852 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2402751342075056, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00226887033631254 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.25642542561601567, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021909983236262304 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4360299975558518, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029988845638560017 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34982928906069083, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002474832286550078 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + 
"rougeL_fmeasure": 0.37228821193907236, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002208419488038826 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4966033704013741, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032394543771446537 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.39901987621454466, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027430779669785923 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4247238232753859, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024321405233947197 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_0.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a643a4015ba8836bfab23b16ba9631a3016efa82 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14108556952980922, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0020956329882564906 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.32629804825161535, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004756174570847187 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.1938850065877847, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00274725146396725 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03392541732989178, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011674749125079006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08137541100012088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00280912091771862 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04697902966116714, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001582424189239984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10765433273641108, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeL_precision_stderr": 0.0015745202745112603 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.25042455718394274, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036806317265347037 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14812739050025472, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002066618016210627 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11258249311328207, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017981489130128542 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.26182933742867265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004140212650666321 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15484724382764298, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023502913687865295 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.8671229080125915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08065653508825953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_1.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..010b4fa7dc3364fddd18a3fee7999b3055338b58 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1818208364925118, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0033811439409721453 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2952342650035309, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004323771519378331 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2072547434350325, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029185590817686906 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.039255544280436615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0017970345435637744 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06401578769657801, + "dataset_path": "GEM/xsum", + "dataset_name": 
null, + "subset": "", + "rouge2_recall_stderr": 0.0024164697736714135 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04454429056503735, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017367902881785364 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.13745172437510383, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0026008926362907463 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.22412058258654055, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033458865795404046 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15664138971417202, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002216089409249802 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1399352840204044, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0026319234043525545 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.23050320162406268, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036405761255609054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1602204327991172, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023319587866224892 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7643751753804513, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0800024640836125 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_2.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..74e4a6dcc577f6a3172fc3e7d05249cfcc20fb1e --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.2472505852052675, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004009850648659755 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2567441304949634, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0038189444756239026 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.23644320153754822, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003260843172949545 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.056866160489061936, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002420240118413503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.05783181540065835, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0022927551175836464 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05354419366516433, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002106078230291637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.18570698243499414, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003303016306234649 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.19264012368473676, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030469236090590076 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1770324612357477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002634383182795547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.18746784009032694, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003301909604388057 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.196059514574862, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003205680714038162 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17924796506451007, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0026665867584947088 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.705819603984223, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.14137667865787865 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_3.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0665587033f09a6a5fd4800ed0f3c751a69e79b3 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.25083050440215976, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004340710144695701 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.24313830962431818, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003987068682586417 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.23375535546317172, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0036448754340216603 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.058302376165758424, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002462019491771468 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.056822850129455496, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002365353936727513 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05453694888387553, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0022209047576266894 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1865779794802579, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003475251260232787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.1815458312356556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031953873575430005 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.17375448749498826, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002893029439925038 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1872796185903397, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0034817440259912887 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.18253965500420624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003256036671696088 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17450339640702453, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002909832621947404 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.1173819151981363, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.23709102615992358 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_4.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..10b4834eefa9bab8ef3cc063d119ea6fe6e7b4e1 --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.06668389338088138, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004287951717923472 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.05770356648149944, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0035956507787910136 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05594753186729229, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0033676619143302084 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.014963307619343467, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016114455998580692 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.013741449593915791, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014260391208111062 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.013296321991506897, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013642061125793434 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.052652024080707335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003605493741125939 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.044415851405158295, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0028463083421698503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04310400237676907, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0026639130697933863 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.053307788896004335, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003626638986661626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.04533527259366541, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0029075081420929395 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04383519297886595, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002700540506868496 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.15136892255345546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.037709514860432364 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_5.json b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4fe0790ba4ade3aa008c94b91ce73b4593e8670c --- /dev/null +++ b/4b284b21boscar/evaluation/generation/slim.4b284b21boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.00273620898511371, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007798516293741871 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.002264558087005939, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0006160940919096034 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0024320284759234414, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006736354616386897 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0002196833564758093, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00011015503412814092 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00016199733180865256, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 8.184314454744501e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.00018583265458717857, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 9.342731096527664e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.002119565890814211, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005829990564075634 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.001776402682704595, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00046929937422078794 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0018932762031196894, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005068809117616748 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.00220984304166733, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0006273058335995559 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0018523030709016646, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeLsum_recall_stderr": 0.0005016261609450595 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0019743733562370912, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.000543487074589739 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 4.919560458856041e-39, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 3.5763240226586303e-34 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b21boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/rankeval/4b284b21boscar_0.json b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bd70d74ca27b8a09c713721da59972ad9aca7c89 --- /dev/null +++ b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.344, + "acc_stderr": 0.015029633724408947 + }, + "anli_r2": { + "acc": 0.331, + "acc_stderr": 0.014888272588203941 + }, + "anli_r3": { + "acc": 0.33166666666666667, + "acc_stderr": 0.013596836729485163 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.18803418803418803 + }, + "copa": { + "acc": 0.72, + "acc_stderr": 0.04512608598542126 + }, + "hellaswag": { + "acc": 0.40370444134634537, + "acc_stderr": 0.00489636818576524, + "acc_norm": 0.5094602668791077, + "acc_norm_stderr": 0.004988888194063274 + }, + "rte": { + "acc": 0.5270758122743683, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.531965272296764, + "acc_stderr": 0.014023739221166384 + }, + "storycloze_2016": { + "acc": 0.6664885088188135, + "acc_stderr": 0.01090262138991413 + }, + "boolq": { + "acc": 0.573394495412844, + "acc_stderr": 0.008650327037726273 + }, + "arc_easy": { + "acc": 0.5643939393939394, + "acc_stderr": 0.010174341733665226, + "acc_norm": 0.5029461279461279, + "acc_norm_stderr": 0.01025960541623758 + }, + "arc_challenge": { + "acc": 0.25170648464163825, + "acc_stderr": 0.012682496334042961, + "acc_norm": 0.2790102389078498, + "acc_norm_stderr": 0.013106784883601333 + }, + "sciq": { + "acc": 0.844, + "acc_stderr": 0.011480235006122358, + "acc_norm": 0.765, + "acc_norm_stderr": 0.013414729030247114 + }, + "piqa": { + "acc": 0.7219804134929271, + "acc_stderr": 0.01045311735833281, + "acc_norm": 0.7236126224156693, + "acc_norm_stderr": 0.0104341623882756 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/rankeval/4b284b21boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json new file mode 100644 index 
0000000000000000000000000000000000000000..bd70d74ca27b8a09c713721da59972ad9aca7c89 --- /dev/null +++ b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.344, + "acc_stderr": 0.015029633724408947 + }, + "anli_r2": { + "acc": 0.331, + "acc_stderr": 0.014888272588203941 + }, + "anli_r3": { + "acc": 0.33166666666666667, + "acc_stderr": 0.013596836729485163 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.18803418803418803 + }, + "copa": { + "acc": 0.72, + "acc_stderr": 0.04512608598542126 + }, + "hellaswag": { + "acc": 0.40370444134634537, + "acc_stderr": 0.00489636818576524, + "acc_norm": 0.5094602668791077, + "acc_norm_stderr": 0.004988888194063274 + }, + "rte": { + "acc": 0.5270758122743683, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.531965272296764, + "acc_stderr": 0.014023739221166384 + }, + "storycloze_2016": { + "acc": 0.6664885088188135, + "acc_stderr": 0.01090262138991413 + }, + "boolq": { + "acc": 0.573394495412844, + "acc_stderr": 0.008650327037726273 + }, + "arc_easy": { + "acc": 0.5643939393939394, + "acc_stderr": 0.010174341733665226, + "acc_norm": 0.5029461279461279, + "acc_norm_stderr": 0.01025960541623758 + }, + "arc_challenge": { + "acc": 0.25170648464163825, + "acc_stderr": 0.012682496334042961, + "acc_norm": 0.2790102389078498, + "acc_norm_stderr": 0.013106784883601333 + }, + "sciq": { + "acc": 0.844, + "acc_stderr": 0.011480235006122358, + "acc_norm": 0.765, + "acc_norm_stderr": 0.013414729030247114 + }, + "piqa": { + "acc": 0.7219804134929271, + "acc_stderr": 0.01045311735833281, + "acc_norm": 0.7236126224156693, + "acc_norm_stderr": 0.0104341623882756 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/rankeval/4b284b21boscar_1.json b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7d2c9c6c53ce4de4114800d881e7e06d9934f914 --- /dev/null +++ b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.323, + "acc_stderr": 0.014794927843348633 + }, + "anli_r2": { + "acc": 0.339, + "acc_stderr": 0.014976758771620342 + }, + "anli_r3": { + "acc": 0.33, + "acc_stderr": 0.013579531277800923 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.0673769750864465, + "f1": 0.3268398268398269 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.40360485958972314, + "acc_stderr": 0.004896173035943315, + "acc_norm": 0.5180242979486158, + "acc_norm_stderr": 0.004986538243846636 + }, + "rte": { + "acc": 0.555956678700361, + "acc_stderr": 0.02990739633379599 + }, + "winogrande": { + "acc": 0.5556432517758485, + "acc_stderr": 0.013965196769083555 + }, + "storycloze_2016": { + "acc": 0.6627471940138963, + "acc_stderr": 0.010932788119436439 + }, + "boolq": { + "acc": 0.5694189602446483, + "acc_stderr": 0.008660360145988744 + }, + "arc_easy": { + "acc": 0.577020202020202, + "acc_stderr": 0.010137328382209094, + "acc_norm": 0.5631313131313131, + "acc_norm_stderr": 0.010177672928157694 + }, + "arc_challenge": { + "acc": 0.27047781569965873, + 
"acc_stderr": 0.012980954547659554, + "acc_norm": 0.2977815699658703, + "acc_norm_stderr": 0.013363080107244487 + }, + "sciq": { + "acc": 0.905, + "acc_stderr": 0.009276910103103305, + "acc_norm": 0.899, + "acc_norm_stderr": 0.009533618929340997 + }, + "piqa": { + "acc": 0.7241566920565833, + "acc_stderr": 0.010427805502729115, + "acc_norm": 0.719804134929271, + "acc_norm_stderr": 0.010478122015577095 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/rankeval/4b284b21boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..7d2c9c6c53ce4de4114800d881e7e06d9934f914 --- /dev/null +++ b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.323, + "acc_stderr": 0.014794927843348633 + }, + "anli_r2": { + "acc": 0.339, + "acc_stderr": 0.014976758771620342 + }, + "anli_r3": { + "acc": 0.33, + "acc_stderr": 0.013579531277800923 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.0673769750864465, + "f1": 0.3268398268398269 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.40360485958972314, + "acc_stderr": 0.004896173035943315, + "acc_norm": 0.5180242979486158, + "acc_norm_stderr": 0.004986538243846636 + }, + "rte": { + "acc": 0.555956678700361, + "acc_stderr": 0.02990739633379599 + }, + "winogrande": { + "acc": 0.5556432517758485, + "acc_stderr": 0.013965196769083555 + }, + "storycloze_2016": { + "acc": 0.6627471940138963, + "acc_stderr": 0.010932788119436439 + }, + "boolq": { + "acc": 0.5694189602446483, + "acc_stderr": 0.008660360145988744 + }, + "arc_easy": { + "acc": 0.577020202020202, + "acc_stderr": 0.010137328382209094, + "acc_norm": 0.5631313131313131, + "acc_norm_stderr": 0.010177672928157694 + }, + "arc_challenge": { + "acc": 0.27047781569965873, + "acc_stderr": 0.012980954547659554, + "acc_norm": 0.2977815699658703, + "acc_norm_stderr": 0.013363080107244487 + }, + "sciq": { + "acc": 0.905, + "acc_stderr": 0.009276910103103305, + "acc_norm": 0.899, + "acc_norm_stderr": 0.009533618929340997 + }, + "piqa": { + "acc": 0.7241566920565833, + "acc_stderr": 0.010427805502729115, + "acc_norm": 0.719804134929271, + "acc_norm_stderr": 0.010478122015577095 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/rankeval/4b284b21boscar_2.json b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3be64ee69f19e69e66ca7f32027f85f2fd07ba3d --- /dev/null +++ b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.339, + "acc_stderr": 0.014976758771620347 + }, + "anli_r2": { + "acc": 0.348, + "acc_stderr": 0.015070604603768408 + }, + "anli_r3": { + "acc": 0.335, 
+ "acc_stderr": 0.013630871843821476 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.0673769750864465, + "f1": 0.3373075012419275 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.40440151364270066, + "acc_stderr": 0.004897728370737246, + "acc_norm": 0.5218084047002589, + "acc_norm_stderr": 0.004985032806802431 + }, + "rte": { + "acc": 0.4981949458483754, + "acc_stderr": 0.030096267148976633 + }, + "winogrande": { + "acc": 0.5485398579321231, + "acc_stderr": 0.013986110301017759 + }, + "storycloze_2016": { + "acc": 0.6616782469267771, + "acc_stderr": 0.010941266252293478 + }, + "boolq": { + "acc": 0.5636085626911315, + "acc_stderr": 0.00867400046743208 + }, + "arc_easy": { + "acc": 0.5837542087542088, + "acc_stderr": 0.01011481940450087, + "acc_norm": 0.5765993265993266, + "acc_norm_stderr": 0.010138671005289049 + }, + "arc_challenge": { + "acc": 0.28242320819112626, + "acc_stderr": 0.013155456884097222, + "acc_norm": 0.30119453924914674, + "acc_norm_stderr": 0.013406741767847626 + }, + "sciq": { + "acc": 0.909, + "acc_stderr": 0.009099549538400246, + "acc_norm": 0.909, + "acc_norm_stderr": 0.009099549538400238 + }, + "piqa": { + "acc": 0.7263329706202394, + "acc_stderr": 0.010402184206229206, + "acc_norm": 0.7187159956474428, + "acc_norm_stderr": 0.010490509832327423 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/rankeval/4b284b21boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..3be64ee69f19e69e66ca7f32027f85f2fd07ba3d --- /dev/null +++ b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.339, + "acc_stderr": 0.014976758771620347 + }, + "anli_r2": { + "acc": 0.348, + "acc_stderr": 0.015070604603768408 + }, + "anli_r3": { + "acc": 0.335, + "acc_stderr": 0.013630871843821476 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.0673769750864465, + "f1": 0.3373075012419275 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.40440151364270066, + "acc_stderr": 0.004897728370737246, + "acc_norm": 0.5218084047002589, + "acc_norm_stderr": 0.004985032806802431 + }, + "rte": { + "acc": 0.4981949458483754, + "acc_stderr": 0.030096267148976633 + }, + "winogrande": { + "acc": 0.5485398579321231, + "acc_stderr": 0.013986110301017759 + }, + "storycloze_2016": { + "acc": 0.6616782469267771, + "acc_stderr": 0.010941266252293478 + }, + "boolq": { + "acc": 0.5636085626911315, + "acc_stderr": 0.00867400046743208 + }, + "arc_easy": { + "acc": 0.5837542087542088, + "acc_stderr": 0.01011481940450087, + "acc_norm": 0.5765993265993266, + "acc_norm_stderr": 0.010138671005289049 + }, + "arc_challenge": { + "acc": 0.28242320819112626, + "acc_stderr": 0.013155456884097222, + "acc_norm": 0.30119453924914674, + "acc_norm_stderr": 0.013406741767847626 + }, + "sciq": { + "acc": 0.909, + "acc_stderr": 0.009099549538400246, + "acc_norm": 0.909, + "acc_norm_stderr": 0.009099549538400238 + }, + "piqa": { 
+ "acc": 0.7263329706202394, + "acc_stderr": 0.010402184206229206, + "acc_norm": 0.7187159956474428, + "acc_norm_stderr": 0.010490509832327423 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/rankeval/4b284b21boscar_3.json b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_3.json new file mode 100644 index 0000000000000000000000000000000000000000..30b8bcd13d134dd1b7e310264f3fe9980242e494 --- /dev/null +++ b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.311, + "acc_stderr": 0.014645596385722692 + }, + "anli_r2": { + "acc": 0.335, + "acc_stderr": 0.014933117490932572 + }, + "anli_r3": { + "acc": 0.3408333333333333, + "acc_stderr": 0.013688600793296939 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.067031892279424, + "f1": 0.3012820512820513 + }, + "copa": { + "acc": 0.74, + "acc_stderr": 0.04408440022768078 + }, + "hellaswag": { + "acc": 0.40509858593905596, + "acc_stderr": 0.00489907830018425, + "acc_norm": 0.522903804023103, + "acc_norm_stderr": 0.004984543540932339 + }, + "rte": { + "acc": 0.5018050541516246, + "acc_stderr": 0.030096267148976633 + }, + "winogrande": { + "acc": 0.5619573796369376, + "acc_stderr": 0.013944181296470804 + }, + "storycloze_2016": { + "acc": 0.6745056119722074, + "acc_stderr": 0.010835369677013443 + }, + "boolq": { + "acc": 0.5553516819571865, + "acc_stderr": 0.008691303433317494 + }, + "arc_easy": { + "acc": 0.5833333333333334, + "acc_stderr": 0.010116282977781242, + "acc_norm": 0.5816498316498316, + "acc_norm_stderr": 0.010122061470742853 + }, + "arc_challenge": { + "acc": 0.2790102389078498, + "acc_stderr": 0.013106784883601343, + "acc_norm": 0.30631399317406144, + "acc_norm_stderr": 0.013470584417276513 + }, + "sciq": { + "acc": 0.913, + "acc_stderr": 0.008916866630745921, + "acc_norm": 0.908, + "acc_norm_stderr": 0.009144376393151118 + }, + "piqa": { + "acc": 0.7285092491838956, + "acc_stderr": 0.010376251176596135, + "acc_norm": 0.7176278563656148, + "acc_norm_stderr": 0.010502821668555356 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/rankeval/4b284b21boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..30b8bcd13d134dd1b7e310264f3fe9980242e494 --- /dev/null +++ b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.311, + "acc_stderr": 0.014645596385722692 + }, + "anli_r2": { + "acc": 0.335, + "acc_stderr": 0.014933117490932572 + }, + "anli_r3": { + "acc": 0.3408333333333333, + "acc_stderr": 0.013688600793296939 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.067031892279424, + "f1": 0.3012820512820513 + }, + "copa": { + "acc": 0.74, + "acc_stderr": 0.04408440022768078 + }, + "hellaswag": { 
+ "acc": 0.40509858593905596, + "acc_stderr": 0.00489907830018425, + "acc_norm": 0.522903804023103, + "acc_norm_stderr": 0.004984543540932339 + }, + "rte": { + "acc": 0.5018050541516246, + "acc_stderr": 0.030096267148976633 + }, + "winogrande": { + "acc": 0.5619573796369376, + "acc_stderr": 0.013944181296470804 + }, + "storycloze_2016": { + "acc": 0.6745056119722074, + "acc_stderr": 0.010835369677013443 + }, + "boolq": { + "acc": 0.5553516819571865, + "acc_stderr": 0.008691303433317494 + }, + "arc_easy": { + "acc": 0.5833333333333334, + "acc_stderr": 0.010116282977781242, + "acc_norm": 0.5816498316498316, + "acc_norm_stderr": 0.010122061470742853 + }, + "arc_challenge": { + "acc": 0.2790102389078498, + "acc_stderr": 0.013106784883601343, + "acc_norm": 0.30631399317406144, + "acc_norm_stderr": 0.013470584417276513 + }, + "sciq": { + "acc": 0.913, + "acc_stderr": 0.008916866630745921, + "acc_norm": 0.908, + "acc_norm_stderr": 0.009144376393151118 + }, + "piqa": { + "acc": 0.7285092491838956, + "acc_stderr": 0.010376251176596135, + "acc_norm": 0.7176278563656148, + "acc_norm_stderr": 0.010502821668555356 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/rankeval/4b284b21boscar_4.json b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8be5a0114e7ffbfc5fd1ea12eea58ca8f00aa5b7 --- /dev/null +++ b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.317, + "acc_stderr": 0.014721675438880226 + }, + "anli_r2": { + "acc": 0.339, + "acc_stderr": 0.014976758771620345 + }, + "anli_r3": { + "acc": 0.325, + "acc_stderr": 0.013526454480351016 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.06703189227942397, + "f1": 0.3929292929292929 + }, + "copa": { + "acc": 0.77, + "acc_stderr": 0.04229525846816507 + }, + "hellaswag": { + "acc": 0.40440151364270066, + "acc_stderr": 0.0048977283707372496, + "acc_norm": 0.5226050587532364, + "acc_norm_stderr": 0.004984679359375623 + }, + "rte": { + "acc": 0.48375451263537905, + "acc_stderr": 0.030080573208738064 + }, + "winogrande": { + "acc": 0.5501183898973955, + "acc_stderr": 0.013981711904049733 + }, + "storycloze_2016": { + "acc": 0.6793158738642437, + "acc_stderr": 0.01079328909592361 + }, + "boolq": { + "acc": 0.5428134556574924, + "acc_stderr": 0.008712936764296237 + }, + "arc_easy": { + "acc": 0.5900673400673401, + "acc_stderr": 0.010091953527506251, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.010088775152615786 + }, + "arc_challenge": { + "acc": 0.27047781569965873, + "acc_stderr": 0.012980954547659556, + "acc_norm": 0.30631399317406144, + "acc_norm_stderr": 0.013470584417276513 + }, + "sciq": { + "acc": 0.919, + "acc_stderr": 0.00863212103213998, + "acc_norm": 0.914, + "acc_norm_stderr": 0.008870325962594766 + }, + "piqa": { + "acc": 0.7285092491838956, + "acc_stderr": 0.010376251176596137, + "acc_norm": 0.7257889009793254, + "acc_norm_stderr": 0.010408618664933382 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No 
newline at end of file diff --git a/4b284b21boscar/evaluation/rankeval/4b284b21boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..8be5a0114e7ffbfc5fd1ea12eea58ca8f00aa5b7 --- /dev/null +++ b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.317, + "acc_stderr": 0.014721675438880226 + }, + "anli_r2": { + "acc": 0.339, + "acc_stderr": 0.014976758771620345 + }, + "anli_r3": { + "acc": 0.325, + "acc_stderr": 0.013526454480351016 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.06703189227942397, + "f1": 0.3929292929292929 + }, + "copa": { + "acc": 0.77, + "acc_stderr": 0.04229525846816507 + }, + "hellaswag": { + "acc": 0.40440151364270066, + "acc_stderr": 0.0048977283707372496, + "acc_norm": 0.5226050587532364, + "acc_norm_stderr": 0.004984679359375623 + }, + "rte": { + "acc": 0.48375451263537905, + "acc_stderr": 0.030080573208738064 + }, + "winogrande": { + "acc": 0.5501183898973955, + "acc_stderr": 0.013981711904049733 + }, + "storycloze_2016": { + "acc": 0.6793158738642437, + "acc_stderr": 0.01079328909592361 + }, + "boolq": { + "acc": 0.5428134556574924, + "acc_stderr": 0.008712936764296237 + }, + "arc_easy": { + "acc": 0.5900673400673401, + "acc_stderr": 0.010091953527506251, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.010088775152615786 + }, + "arc_challenge": { + "acc": 0.27047781569965873, + "acc_stderr": 0.012980954547659556, + "acc_norm": 0.30631399317406144, + "acc_norm_stderr": 0.013470584417276513 + }, + "sciq": { + "acc": 0.919, + "acc_stderr": 0.00863212103213998, + "acc_norm": 0.914, + "acc_norm_stderr": 0.008870325962594766 + }, + "piqa": { + "acc": 0.7285092491838956, + "acc_stderr": 0.010376251176596137, + "acc_norm": 0.7257889009793254, + "acc_norm_stderr": 0.010408618664933382 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/rankeval/4b284b21boscar_5.json b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3ca4157c12985fee6d3beccaa4176660fbb7e781 --- /dev/null +++ b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_5.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.345, + "acc_stderr": 0.015039986742055233 + }, + "anli_r2": { + "acc": 0.33, + "acc_stderr": 0.014876872027456736 + }, + "anli_r3": { + "acc": 0.3383333333333333, + "acc_stderr": 0.013664144006618268 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.06703189227942397, + "f1": 0.2706349206349206 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.40659231228838877, + "acc_stderr": 0.0049019365115461205, + "acc_norm": 0.5252937661820355, + "acc_norm_stderr": 0.004983392650570959 + }, + "rte": { + "acc": 0.5415162454873647, + "acc_stderr": 0.029992535385373314 + }, + "winogrande": { + "acc": 0.5453827940015785, + "acc_stderr": 0.013994481027065998 + }, + "storycloze_2016": { + "acc": 0.6718332442544094, + "acc_stderr": 0.010858184920580584 
+ }, + "boolq": { + "acc": 0.5311926605504587, + "acc_stderr": 0.008728020822889253 + }, + "arc_easy": { + "acc": 0.5875420875420876, + "acc_stderr": 0.010101305447864778, + "acc_norm": 0.5845959595959596, + "acc_norm_stderr": 0.010111869494911512 + }, + "arc_challenge": { + "acc": 0.2815699658703072, + "acc_stderr": 0.013143376735009024, + "acc_norm": 0.310580204778157, + "acc_norm_stderr": 0.013522292098053054 + }, + "sciq": { + "acc": 0.91, + "acc_stderr": 0.00905439020486644, + "acc_norm": 0.91, + "acc_norm_stderr": 0.00905439020486644 + }, + "piqa": { + "acc": 0.7252448313384113, + "acc_stderr": 0.010415033676676042, + "acc_norm": 0.7219804134929271, + "acc_norm_stderr": 0.010453117358332828 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21boscar/evaluation/rankeval/4b284b21boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..3ca4157c12985fee6d3beccaa4176660fbb7e781 --- /dev/null +++ b/4b284b21boscar/evaluation/rankeval/4b284b21boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.345, + "acc_stderr": 0.015039986742055233 + }, + "anli_r2": { + "acc": 0.33, + "acc_stderr": 0.014876872027456736 + }, + "anli_r3": { + "acc": 0.3383333333333333, + "acc_stderr": 0.013664144006618268 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.06703189227942397, + "f1": 0.2706349206349206 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.40659231228838877, + "acc_stderr": 0.0049019365115461205, + "acc_norm": 0.5252937661820355, + "acc_norm_stderr": 0.004983392650570959 + }, + "rte": { + "acc": 0.5415162454873647, + "acc_stderr": 0.029992535385373314 + }, + "winogrande": { + "acc": 0.5453827940015785, + "acc_stderr": 0.013994481027065998 + }, + "storycloze_2016": { + "acc": 0.6718332442544094, + "acc_stderr": 0.010858184920580584 + }, + "boolq": { + "acc": 0.5311926605504587, + "acc_stderr": 0.008728020822889253 + }, + "arc_easy": { + "acc": 0.5875420875420876, + "acc_stderr": 0.010101305447864778, + "acc_norm": 0.5845959595959596, + "acc_norm_stderr": 0.010111869494911512 + }, + "arc_challenge": { + "acc": 0.2815699658703072, + "acc_stderr": 0.013143376735009024, + "acc_norm": 0.310580204778157, + "acc_norm_stderr": 0.013522292098053054 + }, + "sciq": { + "acc": 0.91, + "acc_stderr": 0.00905439020486644, + "acc_norm": 0.91, + "acc_norm_stderr": 0.00905439020486644 + }, + "piqa": { + "acc": 0.7252448313384113, + "acc_stderr": 0.010415033676676042, + "acc_norm": 0.7219804134929271, + "acc_norm_stderr": 0.010453117358332828 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..c7d10c57d7f7737dea0dd76d77f79fb160052826 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2992a9601d90d4edb9f7bbbc841dc5d394169dc7e971a575198cabc21e1f7b1e +size 199058647 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb95375e8d928adc8cccda75a8e9ae3cebc2b455 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94a5a3940d84e0993ebc73364e17893d1d75911babd0e15b24ae1cdc3fae080d +size 199058647 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfc915ebcb1c51ab980882aa1b4b1a5311a2cd53 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5fbd68b736465c473b558e6e1ac7d9063d49e557c25891a0da47ef674bf51d5 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c77455e4f14f202a5d4ca5770d05729f5ff913ae --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9681e3e7d84116bb4a8b07b1842abcdf36257cc585e621c7048f771091cc58f3 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c36d72937207ac52b6029c004fc350a3ef26ac2 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:284ba9bc7d64c23ce999a30029f8725f6b17637f29be70e2abf717610e1f44ba +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..17c2a7a8621344254f44f9242d67cca849503945 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd661b5f93f5a96ce466400d99101f1d0bffdd0be6fb17fe3c66080f1f403517 +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cafd86f6200dd55e987af7437c34033347861b83 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e53a5e8d94f5698a0cd3b12124790d8e04611f13cbc23210931beb4523c698d4 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..700c35b1e0409ef0be073f0bd0d875e2f0cf0187 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:280bb6e1044e5820f121a040b237db620664aeee678a19cff96611ed7f2e3708 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b3c299d474a1eba873508f15b898cabdc11bee8 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b4b2503a1ec5c6770a8b664deb172837ce31f0707c34bc93f736ddc2993c621 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d9b92f99515bbe2d38de16300ae6295b68114e1 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c593f643d264d67c35b8ba119cfe3f672ac662d8bd76ee9535470163d1a0aa92 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4429dc688455e0b79dcdd6367170920fef973e1 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba4b4b54ed6bce1aaeb42a9d1e47e9931bb7e0457d917031f3da859db822f4a8 +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..405cf60c52aa0eafbc45ebb606091be618918bc0 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59963ee001ff8f99ddcb5d7c6d4a9c9697c45cb2eee2ba3eb1269ba1d3c1ebee +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fab1e770fa630b0cb36ee7e7a47835ba017843c5 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1f0d1e069ebde4d9f79fd975a3ada75baf385efd19144d823ad91cdc6380ce2 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..9391c0d1ee447a547d772d8eb34012148f099041 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:013905c67823c0cfc67ab121a592ca76a2706d93779a3239d83f42309e5d1f78 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8087c5d4b1b59ee7b499c527efcd4909e5672aa9 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b29dc7abfea12c542ddcd62aa5dde9b81cc919c4dd0b6ac697038eb69491093 +size 199058669 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b6ec3478f76d09d122d2d75307677891df127c8 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5c05f464ae2041a170385ebdd7ba3dfa502851e2e121220f72cb1035a2c0e04 +size 199058669 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..372e095a07b57a4b9e94942ec7d70ff034de1867 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a072208ecb9587d911fc3548cb08d109b97842128953adecf86da39247410eb9 +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3fd732020b88a6ac98794dc57808ac2d6bbdf4f4 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26bb9efcb1bb180a26f37a028202cc94cea048309b6bba8be4f6aa237a62d73b +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcd8ca0166860da6e83349f17f0b7985f81cfb12 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6c4d69d39056db9f8b493716f19e04c3a2f7d30fe31835eee643a5519b81406 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..996ca6a5917b7023a2b0e73bdf6bed51aaa20ca2 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3df690fcd441f825b1c33b7589d3149a22b3093888bd61cbac807c5946d0ab2a +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a8902bfde9557ecd066d58263bbfb916763295c --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1460963e2a49117dcc3b4c25786466d6bfba51988e5792684e74ddef08a1d05 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..159b566515917dd50be681c7d7add9477dda9996 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcaf4409dc43e45cfabc8c0dd3e063e08f6718fdcf61adda7a97cbf06da14694 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db836240b92faba1c48373e0ea4a827d6fe1246a --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd3701491137d5d0adfbbacd615cbde31632863a9374af63c4ddfb0e5d92e258 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..971c8453f9e8ec4bd0111fe5ee065b6be9765b99 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4baf87f2673092dd8c8c7f8b3c77455d1dedcbb4b9a5e86a259355e78ed02ef7 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d6fe859f174e17e85f8c62cce14d021a7f5f7da --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2981fe366b15f21d072e69960c9a65c2b0e00be5c463b3e5a179ca93ffbcdb99 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e2c571c4ea80f709ef757fe71c16ac5878464d1 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4ea5fde43b34fd5ed0f4f44fde638d5875159b42cea36c936583e20ddef5ec1 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..fbc736a20cf9545f8da9c3ac6b6a330344043518 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:534bb584590f91bb3f5f11369d46c038f83a202bf2a6cb0d58f421b3a8dce0bf +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da41bec56d51e9ba3f20fb4cf52f16b80415cc5f --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0eae9155ae05d9860bf4a84be14441223af8e4b5da0dea1f54375337be615e07 +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..997606a81bba2d4a4384cba183081462ca8eeb3d --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:553736b85fd22fbcd35e89da1b9e3058a9e530e6149da75be66273ffd1906d8d +size 199058605 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce3ae9abba540217d4a6ff24f62f5e53fe2c0765 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:692fac6732772cf6422b87480ffdf6748670a6799e3145c935b115edea0d144b +size 199058605 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..087e48af2953641bd701930db4183aeb1becdf05 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fb6bbc21568db914c9f72e4606d30139df79a465fe49d26777352ea7f60d1c8 +size 199058669 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffc31e817962ab15d13cbcf631c376ef432ae78f --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b1e72d7a3ae660054a82ce001b6b6d855f6a1a6d7e0f135f68543b9bce3dc8c +size 199058669 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d586de116b5ae68a1898405b37c06862d686b27 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d2a043144d4e63226e2c2547f2ba8cc26a39494bf6bccf68647a692e3dcaf58d +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..808dec2a80a26d9c66f0e0a0b2ed352cd526cc87 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42917f0a49c48fe3b26d7e4f6d070c55893cb4fa0b82b3add8d5ec8a630a4871 +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b27d4a96a5be2e7f37be1e6960260fbf56e9805e --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d6a9e447df50be86b72788d1f4b8fb00d842a9b43067a23a3522d902fd02c7a +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e399860b98e4147bb2c1c4f8da9146eea77dc16 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25b4ae79ab2cfa5141695932bf6dd76847bb8eddc98a416f2d70575968349e83 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..155f8afdb8c809a8961a7725af9afa50ff165caa --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f934f1ab1827845c823cbc5e8d5360ba4e9b4b8a4b990f51a04e893261f9125 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd30a7e6f7ee2fcf2b5538ff4b1445a7f50fd1da --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8b756914ef9882594909ff7db9191bdd02437100e6b1ab722f84f083b8df96f +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ff89b6ac8c028622d60370ab66c97157866e279 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e1691d6825a532b0c45a6caa77201695bb57af2695704a3af9fdb0dbeb5d875 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..d53537ad1f090810e0d5af3576065efe0d8ca55e --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdf6e22b98b7d21a1e2930ba49e15d328403b3fae9405d6bb5f673330ebe1257 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c81803e5bd8b4285188c01d477d4f2c3be76b21 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2f087788335a03c2bc848ee5fae89618b0bebe6b9a0841c38539a08060fb1fb +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..273c5aa639e3ad9c9c3e7dc8b7fab55241774e25 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:322158e33956a48038a008723752f1a05c17203adc14f9789b8f1866e4ed9fc6 +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd201f9de2eb35e8b3283cc69c1880e490356c9b --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1a9b42fe65c9df378db292c0abc60551fd6d66a6f32953b050a41df92d215af +size 199058669 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca339a7b0ef5d2495f32ff0343a4d0ce89596c1e --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd059437498728382676860c2583b99d587be9a93ff25067bb4ebed25622d056 +size 199058669 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9f77d324b3ea2590956f3768ca103a0b8cc556f --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b2c4f1356438366053b59f3652f301e01c4e4e66d3a3775ddc1aa35cbfc2a3e +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7be6558dc0d788746d521558326c7987c85a4ded --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:69a01aceecd93057e0bfaea92f9f0521c72494aa90bc4488ba3a0bd2f7092374 +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3402ff71fad65062deb7dd0f6b18fa622cae393 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d4ebbd0395e2f2fa7562b0f01e04cf0ee674e3cc681c301b59829df226172b6 +size 199058669 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b7dd5a04f36c01df3d3a71aec1e3a1a2db2629d --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcf0919001cd74b1a7f3699f79246f90870eba482dd6fb421437016145271f1b +size 199058669 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81ef0eebacf44b6e77ed0370cbc0b96af7eca665 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:482386bbbe58ec503fbe608904cc17a73a19357387bed0522a87cdf356fb7809 +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0ad75c14a4f285b96628ef9c4ad147a1ca258a6 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48b9e3698ed201993a3284d0dc720453ec6691d2beb2912d0e324e69b6a8b5d6 +size 199058797 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..000d45818108eadaadb3419aa58856b14627d1b8 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e94e3d1deccbc54e5e851ee3f07ac6b1e322373b185ae056be7fd7e7fcff092 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..966a5314e600e72cf9c1f1abcf98046ec60a9a0a --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03a150df5ed0f4b41afbf731002afda0772bc1922bf0753eb87d40a22c7562d5 +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..c363959f8e12dddf6dd2f51764334a278fa8179c --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72cc45ec3842c76b80c4ab94bc646c4c0d8672a57e2c5270f8b1e87e567fb25e +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..08f3df80e27f150941850155c37d5df7d0ad5297 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b6236fa4129b508577a660b3e428fe1772d8ddae19f9f5fc2187c616dedaa2d +size 199058733 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..12439defe97f591b9771f6bc0748626e612bdab9 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d01a8fb68985b944e7f1ae8eb655697048a70f5e8587a90a21248a0236ff2a37 +size 199058669 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fd289e94a2341da7666a2ce3ec722fad096f82f --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:528227eab2856d2e07f71aae109506ce7d8e28daffb25ec0775ef53cb925236d +size 199058669 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e9fd711969bb9dbf6a5cd58dd529919dacefd82 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:389ef1100cc36f095beca2a19dbbb60856a3f0e17c75e9a32bbb131c90b98c8f +size 199058925 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9f3b0c1bd22c00de11811495adc4960fc766c8e --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40e1c91093a47567aeddee8cb9cf329ab08391c3f6cacabe573f55e04195806d +size 199058925 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..972fe29ebabf76343f6279bc0a7e0bb16b275c9e --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fedee3a377f035b46fd213ef130ecc117c3e472b330451391ea8697e04307eee +size 199058605 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e811f041c8d2d56d79dbe83c1b88f9a98d65997a --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41e6e541fee62fc2997207f00aa58c7a2dfd344fa1042e737654e4437ce3519d +size 199058605 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40f2e7cd8e0dd4e8eff7a41f9428673449de6e22 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf7162723f3d5d28020e6efa7c88ac853fc4e44d0c83431aa7adb4a380167da6 +size 199058605 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f8b2a16348e2b649daab3506f219505c85913fb --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44f30da74db0125a9afef8759eaf03bfb2aff03c60102a5713b1966fbb63dd28 +size 199058605 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c386d3e229d2b5125d5f8522ccfcb5f03f7c4a1 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83372cdf02e20aea7e8ac74ed83a3ec40ca978a732fe5dab61138c4daf6c398e +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..136552e3d1a4bae6629766d3bd6560bce65fb62c --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7b1322771c10ee3ab2cdf2fe19ff9014291162fa9a07c3ca82fb96c5442c22e +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..774eb896f22838028a7e7fb429b89d160012ba46 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86a99657b7bd768b0ef61b19d3f48ad3852c7a01db142ef20c86bc9cb653ac7c +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..987862095740771ac14cc21be2f9e12c61ae8e32 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93a1e1b466f153b66984a61c4a2f9345dcfeba05c7d7eb7b3f3ef8abf5207e40 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60b01cdfb58438a9cc4e810440e356635e9ab6d2 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:019b374c381673d124e00e2b02fe1bd7aad9bdb1e7793cd973437e86ee33da0f +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e15baf58ca546734a398325d2dd9785110a4d714 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed9ad7fbad01d497066cfd7ac3d636ee5862a333efb11428301bff93afed24d8 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6f48616735706e14fd5ee8edf974732063418c0 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f238c2a8b0c35bfec81346ee8c97c79d285f11aa0c03003db3007bc959fa3f68 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5f209c482da54e3dd06fe9858734caf1cb0d302 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a4bcaf9909207f6833ce9d7eeb5ffd0797022dc7e9c0abc625613260b6fd030 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5aac1f086359277679813be851c2d55d5438bd2e --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:050e59caa9dd99ed31e8f1b1b58e1d5ca53b47350efe29fc05f5fb7361b5002b +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..415569f94c0e8609bbaf1c50796bc40ad42a84d4 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:584e61c84e119d8a39dd83d5886312849a95972eb205c8d3d10936eb8fcfff16 +size 
199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0151a9c15c8588af10174c9f1012009c1883a82b --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b36966f6cc6c9dd67adc7d0f4d0fe8916aabb97cf4a47d68aa2f3eeeb5b3b649 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..84fa1641754df5e820de625e0351e0503d0874bb --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c44b0880a87e9a848c11401eee046b5426fe33e3adfaccae8a3885659c1acef +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..620a0ee987202b99a2c595cbb12de7772b0b83e8 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7939bc70e9d1e206fa609c675ad438feb77631cd6282d91f47f93ed5a18dcef +size 199058978 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f54a47380dbea660016d9dd3ea283ab60bec21eb --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a0ac72550376b981cc084c84e3a33faaa9b26cb77ce291862d11e697d23767b +size 199058978 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2eeeddd535752f41606f63be981aafff878383fa --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd5a252e8a1b94ae8cdc758210ea7e2c7d07050be5067e538aef5c2b66dd95ce +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d07851580c1c99892248528c4c6258ce127c358c --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e3601608801bcaace4fe0ab0a33fbef3423e89546e0e79742f93e67b8651dd1 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cae84e3c8febfd73b601fc04ffee5c76cd5f9032 --- /dev/null +++ 
b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:943545488cc96a84d4d68359f7f6ee82ab6b603d51d2d2ca63e9f15bc8abe619 +size 199058647 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51927a1a9f84c952fff7ff30b9403052fec821e8 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b40741d71eb335abb720ba31e0b3274573b2e7e7745c0558aa45fcf76f1e5cc +size 199058647 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11ef297c300788081fbfa7ec9d635aa2f137bec6 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d0161b2f5bf57f15f347283aedd2e079b2aaebe8a9f74ce164a7f9c3b888092 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e442b4fe5bcb636803b6ffe078c7a6b4b2f435c2 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0e1b2ece5c00b5e42e1c50bc53dceb42314bd9cc323730106e0654008abbbc0 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a03660349113f09dfd6037416c1e6ea2455698a1 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f813e494739847e8326456f05a340a189aeeceeb626fcd03a51c71a53c1aefb +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3911fedd6ec649bf6579fd2b9e2a692fb98d02ee --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3bcfd0bdd18eb503fdd0c6d522e23c214a47f22df1593797d9da6906f8356d7 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a288ade76a7e069f0a4558e7dd89f9f1674fcd8 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:268ceb702d710584426a2090bc7f8a4280ea2395482d6e98c6e97b9d505fa6cd +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 
b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4f21d511dca23977ba898ae7fd6760ecb8729e2 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e867bf88caf56e704a2bd9c676dd86d3873215493ee8230743567e9b5820b39 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e789ac1b8d836e5f7e69e798035f88ca7d8cda15 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10fb6d053b6542937fe49232c654d162081548b690602a1aa6f67c22a6bd57d2 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22179034b80d2e375335979f7d7449fa1f4d4acf --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d13762c2e28b27c895d37d325ef60f5e50be6b35ab24d8385ca0213f7ac63fe2 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb8e37065fe28cc306510982f53aca0b28b455a3 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5a7b6bfd892a6910dd1cc8399d8d1d8be272160a76c48139c455589523c41d +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20d2e990fd3038ee0a98cbe89eb8f7e63acaaa41 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3596d031733156a9d04002ddfd7ab56c52b0ef39c330502a2d7e7c3cce289878 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8e38539b26d5ea7949c6e7921f0f7ea9ed5d87c --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70afa8b351260cfb9607c884131e91e83eedbd2209c27a45a3aea3204b533ab3 +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dada01a586f5af663855d29123d00389582046d7 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:33fdae0de4eeb316c1e2db8837257f3bd8b9c63d29b1b7c092b43739b4df7f1a +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81bbe4fd33ec42d118ef5d836fbfd2929d5d8085 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:104adb33ab5b573a84894e4edec3e08ba2ff31adf12bd0b686f975a03977ba7e +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fd3366cfd77c20edfb9f9f192f29ca44e09c089 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8b80737955dd6ad6e696ae4ae2b9e5fe15cd92ccced1dea3ca294d835f2407b +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f98efa114d977f6275ddd6ec2780b39327f4a74 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3613ca6c14d4318d159f442603b475b3c80d95a6cca913e7a54cc7d7d1cdc9ab +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c29ec78ebdaf1a059fb0ff77ec98af7f853d089f --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff642b296cb802ecd6b73b56183038da1c94394442b6b5c7d0f8e332aea3c99c +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..87254d518b0765963892ab0978ecf0b26838674a --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a14fc84ca6abf8df720c0543333563340bf8862c0c7ea643081495a88702288 +size 199058594 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..977df18110e7ad115fc9766bf9f39d4ce3cb7ce7 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e66eb99bb04b0fdfa1f233d8f7e642f3c50e5a8dd957206b3d76c3bee5ed280 +size 199058594 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..967f65610fd98fb8cadb17ae27090cace57107ce --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edc5621e2ca9692f382c9e7fba27d353568a6cef397a46347d31fd2398dbc808 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5e755e1ddb733674554da77647ee544caec1468 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24fc920e858dddcfa170a22aac3ffaa4aa2f86015e5546b95f42029d86831a8b +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71b7ada51616a1ae2e38a60dc38096cac453ec72 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9727362ac1ac02e6d78868650f233117a684d9e13dc9bcf6944bdbc70fa6d09d +size 199058711 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48155f5c49998832789c5734f24c7aaf90a70995 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d536c3b65f803f97cba6728851c65b29ebc6f8069959a6fbd4e34b05f64df34 +size 199058711 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e4f5cf93aa67e26f9d370a25cf31be516c7f2d5 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d9eb6730899883ea42dac5a8ee1fa700abd1bf7e8f1674715bff302cf8a3fb0 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f2942840b2f4e690e7b93667971185d8be3b5e3 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5d8911aa245765b5855615b0bc971ed8834ef7c6a3ad6f6329605b474855298 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..886d5dcbc816d200788c91ddb3de12dddfb4603c --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf1f51f68ea53e3448acad94ff1e49b4fb6493fad0c351eb62d26f03a20bf2ea +size 
199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6c55c5cca08dc5e377caee51e723217a7daff8f --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84e830b2c00cd6cd393832615b2a82c12f1e444ee6bf6ec82e2b769d2af23a90 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62a944633d095586ad8ca2798487cb7a0b3cd7f5 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ff1f717d6001d637e6b2fd6cf5d924aeb046512be180532725f4f98d5dc7085 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..689e4f70094c181dc6fb749f5fab4f60352284db --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ac6a761cf68af203dfd0458ad011ac93bb503ba254271d597431bb3fee6361a +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33748c4cb4f8b68c202e1536c638c95679526648 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dc7e52c30faff0567867442aa43ea5da9790622517d994241d1ea6ef0c0c4a1 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8ada61d28edfeb03b20e08bc88e85187757b06d --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:debdc7224825e04726b6dd104db9f86b731e50197f0dd0e9c8260f89fd948e8c +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed357dd9cc882e950965c7c8731707bed158630e --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2863cdcd35183c08a972d112e0b29d9007d7aaa2983a0f0199fc8ea1fe5c194 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03833dad849472de50f59e0e686bcf4bde89bbe7 --- /dev/null +++ 
b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:356695fe359addbc6226d154358dcd606e837e92d04117f71cb90ac1019a7811 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..514093f8e44fe7a2cfc624ef5da6a26f24b0667a --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3105787cc19cb7883006d53c40873fa1851822b3434f6221ac5e92b7394fb9bc +size 199058594 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49003434a51550d4478a153d24f184f548611b50 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0ccb81eae67188bd06f8507a28da094071509b4783ca6a03f7157a8abd8ed41 +size 199058594 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..872a90529fdcb4be1c053208520637d20164eb06 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba8196e236d5b3bc17cdacebae047acbe510f6cfffb454bf00bd1798f4b61df2 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d01fb296c8d7c68e14d71d78c36562ef5814c289 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d102896bc34b058614a6ba5025f94de8e44c56e7705c8e650cc834d16ef1fdfe +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c9da28cf4cabdfd18b12a6c72d9d3c08a2c722b --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67ca09d097a450b62cbebc1c531ab04b404075e74c89a5af6515cf55b225fb16 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21a6f33ae9153744ae7e049172dc9c07d9f5f354 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:035db340b3de634a2a46ff3fc841b4175641f5f300be2df47c1506f39e8ee321 +size 199058786 diff --git 
a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a257f8fb4a961c089a68ecba821336c4310a6e09 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3cb5867fe2f9bbf7a065f01a0e90c98a5df14e1b5fd29337f79a11bc7e41f57 +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..984089c8c873eb7b754e54534e55e72d016cb8b9 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d72f398ce5d246c0c593952e7a734c7eae59ba7352da37c7f94b94210c49185 +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8750a7b9209470fd49266d636b59285337645eb0 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a149ff0cdc1320daed9d44ee021cb832148b1ad730d7b824bcd7105d3f4d118 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d857e5af695d1fefc099af2e8f1a2a10faeb421b --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b9c20a72c8c585397344324952867f8577d16e3c03ea21626b5a0d8d98a26b4 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..90a7fcea67b8252f5ff6017a7dba8a4ed8d362c5 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8577ae688d371911db556fa131f532058fcd584fb0aa27f969b3204a6e82108f +size 199058647 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cadd9e96c3ef452004542d3d7410e218d39497ed --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78decd1926026916071f6a09ba1b5f6317f91fbfcfc3deae0667ff5317a640cf +size 199058647 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..107fce76e4912e82ca3aea212d6240e0a4189492 --- /dev/null +++ 
b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:024f3ae73c495f2bb61dfb9e512993808aa5e42cd148e0be55388df9cd4f928f +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed3b9da9273e998173b26a9a29009f32748456ff --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b73a2d1a0bbaad92bd8a1bf0fbc94dcc0e813ff9abcc34ef26e03116e3daab8 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d8283371f944d01be69f3d66e541e5efdf3c804 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e493691e4ce3af14faa3526987c1326cc0e05d1f7756ad21a202da7060df2f4 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed783e731a5a55d39c4eb7db851a22955906d1a0 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c651ac4884b33d9ad084beb9a476e26ebd929af59c6f7ff05a231861a14b7124 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c42591b4f7a7089092bf59b9227b663ee363e4b3 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27e4a912f4ea6158829c242a8b52b7feb14ebccae0f846a6025fc5ca5f74a43c +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd225c26607488713d5f4ccbb8bb03fa9e265e5d --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7b9d07248ca07872d8c1b5d882aced18acc135ebee05e7ec0f31af749b49462 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb4c2e4802c5002d444eb57a92ddac9234def9dd --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:771af6b2d2c12500c99ca48a575219a4b8740e56b5fba961200e1102d6f96898 +size 199058722 diff --git 
a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4517ae625309233dc3db912bac2e8ff7176e4144 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6462a9f58db6fbc6ea0d127d52a13d24e72839b6f2d6381cb1da7c8c4fd7c301 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..622d571696777b1b79d3cc2a1ab25f6ec0d93a52 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:278837e803c49c2df8b3f62713d23ca537fa9bc350660beeaaefbd3ab83be89c +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f036e73a827063ede65055206bb2e7ff2805ecd2 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c827f1527452c7d329d6703fa59ddc9dbb78926e38053d9564040f9c8f27caa2 +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d642cb1adf12515a7c69021bb7faba7a332fcd9 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c04a3f036622f88e7d1dd521862bb0f1935b24613bd2458832a5743bf54f98e +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3beb686df3bb15f2ff791d87306464dea47ddc94 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18da90ab349e9642c2745fb53c7cc50a1f102fdaa5cc6a72cd6a615952675b19 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..daa5e991541996f45e008da660d3c3b526992454 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e64ab69482d86916347e64718011e01823137239f35a52c9634216de2a0e621 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..441a928833a8d6af7f9cc3c4b49786d04eea6550 --- /dev/null +++ 
b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b1bd354d1093f041ad3d140409ea6d5633f3e6e4f4aee51a50c59d898f77696 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a128c4426461e2c623f6e904b8d175781a662f6f --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f72e05c0b2b1b6f6763c24dc735a67d495e566607dfcc43d42c9426261966a83 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5561b7744bee84068380dc59349cd27a3c2422e --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0fa909ffeec2ef104fe9aead30919aad6157727f3439985b01d399e631cc58f +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33636b3a3f177453cea4b3a2c032342bb2ad7c39 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d046b961a7272deb8110f1c4accdcd1182115f4c30130c8fba38542accb4bb9 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d754481f485ea7194c1eadb89a9b489718aed22 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b602899cf754cd29b0e290587cc6b4235df10324c432dd4977963f4ec9a3b04 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6490baaf74b607fe743dc5395987f19373f3ae51 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0297506b41bacc5558829bf17ec8cdbe262d3f3f39fd72cc4fc787a853debaeb +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea925f6c12d86e87b2e1822907372119a5478839 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:190b1cd30c390c58186768f244e1a79ce6fa4a2fdfd012b4476912469391eaaf +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 
b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b6debd3cad52ddd22368eb7c2deed5de04961f3 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d38f8d9cf16e4cd8e04cc30cf919c445a1159116152f2f62ece6aa38fac30d0f +size 199058775 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7379c51c69414206f987526fa8549f5dad70bf4 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a26cfeae26ba80ea1fd132de90221694ee247f3b00c7b9a7427db05b2e6ce387 +size 199058775 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36df16646d3982d4c9c58476df78ef082761a3e8 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f2b4e858906dad8b1b0e3d024d06132868c17d95555d768033d5cee4e083d95 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..828c25215683b596810ceda99d8a4db140973ff1 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d76423106743d8683a21eabc1dc56f051497df99c154bd2c87108dec2813b2 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..921f2ce11d89a43061f3b045a812f8ae3878f696 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:042f68eeecc3fe4eeb040feeffa90425a852c1fa234cd6e17d87eb40ff96d8fb +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0dc757dbb6defbc2af4fd40e9a1b642f25eb0f74 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d6386406d2bda26c9f9bbf965e718bf35e9ebf12e444c7877c5d0c174c3bf12 +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0f1daeca3cd844de759d893db744a05afcce93b --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:60064e996b46211fb4cffcef6824c2f3c6a090bb667940530163be8acbf6d342 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e30e6c33fc278ce1c0954be43ff6b67e2015fe1d --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19290a0c73a6f8023ef586f70773add934b156fe8eaa45d12fde37c8583dd5a6 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65040b19899f46b56d0dd5bb363ab65b9d91eb16 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7613a342e5bd67302d6c2a97f567fd91b150f43359cc2ec4051004ebf854ad84 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ccd2bfea17a63911f5019468c8f998eb37a36fd --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:906a17a9bf5bbd75edd276c265fc6a9ebd352f17c20d67d6c4a024a5f27c766b +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97a3aa8bb9f309dc6f1de051213f0c4ecee628e8 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:330605eae164179b81c5a2914c0d8b1492496b5feec33fb6708d87cf2538d524 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..578e7c60fb60d1e2504f59f48fff595d35cce61f --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6a5c62237380f8cd5110ae09d0f9bfd43224a4a3af08db6d0a42316fe1951c6 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3fb3aa06af76345b9b01645b71e6ecc72777df7d --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2db96a53c131de91191211f95ee27025d121cf2d37d9555eccfb571ac1bb1092 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..fe63de57a3c5ecf719bb2888072547bc16629a70 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0fc4d580adf6f55d690ff43d6e31a1aacd8be8651f482e373c8bb81943fda0e +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23b3162d81f8483ee6d6035257d1357f76964ed3 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ceae6308465f86a9a8f9ac559b141e5efaad7f019c0c4937e8914420fbe91c0 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46b10b6354c4e8d1cbfd8b020cc596534d3cc269 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a82669abf0a24d1516e7d14f4da650bd1e13919a30492d84f921de87a5e3c1c +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b68326d14ecc62f96c38053ffcfe2e066f7b0df --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91713f12c17c272b77937c14072060e3d909d8bd436412b142b181435d490108 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..434cda067b3ec2033b5ce2109f0f13a1fdacdbdf --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2441d3052c1ad97c51b422db12a696700f06e983c2f4fa391b2b7e8281d06e07 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe87b7eb6637df389670e26bf533ab6724a01bb2 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8788b908de41fca5c9cf7dc90c0583cf182066ef944be710afaf66105a497aeb +size 199058914 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..380e7ee3784be0e2332c461a41342be9680cf7f2 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d341f95dd9eb240dc72671c137d3c010dc865a99f358b1caf8fb468317d435d 
+size 199058914 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..599f658d92f66c726c2dfa5cfacf0c7ca979f471 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1524c17e070b5162069b1e61c3be7b28da62388b26fd1cbc6d614293ca20ae5f +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..adcfdde68b60789e06d2490c254f8ea68bb1f569 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42b68407f44ddb8c8913cb3dc6db46676d0a0ecefe9faef10fedff4282897dac +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4796013ecc73f794d839b90f79819ead235b2b2 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa922a46730491e8a994e4feb680892eb60b85dfa91c35407169ee6dda6f94c7 +size 199058711 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..724916f9631ec6fd8df8fd2d26c1047e25335afc --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fdbae5e10091eb9710b1d08256a35d187e58e1610885bfd47b26bee30a4bb17 +size 199058711 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..efb00c27e2c53c11a26209d6f14ee3cd2ad1df63 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff92823999a4d3feea2d5bdf9ecd068ada2762d27cb3d5030accdd4e3b45dee0 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5bb55cbdab7893c4d29b02f40a8b01ff26269ba --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8baba019ecd19c56601a45d79eb2f2074b55ad93cbe18424ebcf0e7628b30505 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..358f7550d6c0a305b94d935e2bfa2d7032f6d270 --- /dev/null +++ 
b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bf8a1a4e210b21a27b563eb04aa1512fc64ed16f37b484f619ee1e96ed0a365 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbedc9247168c784caeb6c1d84b22a04b26dd4a6 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d1659ffaff25ea53170283c78744332fc3c15955231d490ca3e7e94e9a7840a +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..edbac9c2ab5ed6f041f11d5a2c64e0caffeb048f --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75a6e13dcdaf2c57219c847d63cc36ee2d2b22dc906dc50244ccef934ac728af +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6b6ab4c758fcb9e02ed422644c082906bb5be19 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06e9505020f5cb11396c57b05be88e784f7e941706851bd1f935084c5eb7abd8 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6097c72f5beaa36143598b8a3d16f4eddb725e1 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2f5a147255c4a0d3f865238d72a2c064bc1c64e8e995940216fcc9d86c8ae73 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..652ff2db28cea56d8af894014adaa956bcd66bb6 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15a007f02665c71325739862015b3992bb11546853eb43bb272473d84124cc39 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74b0318df9462975b06bcf2fad17e21f0efeec55 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8297442823e6521e90cf1eadf73871679d5bcf41c350efaeea396fbdd39486e +size 199058658 diff --git 
a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69d9f1697f31e681389ba70925f87be0ca5b299b --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18df884605c7bbf4ffb2f4a280b4ea8ff8d1fa3ff9b79932a6ae356c7b6b9b3b +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bcfd4d9e10fca33ff80cdea25f2a669098eafce --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9ca5a0a8be3ff24d5381e27dc249ef7c0d9cbf4313c179a04975c17d2b09f0f +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1ce3b887ac9ee0ec5412175255308fa67c2cc24 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f78a13aabf9f283c181bcb8f82f2abe07e5603fdc99a6e78a8d94d0c49601ae +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f09e0f2c0d997214b3dbacc9446727421d57bfa --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d445bbb1e43aefcc871fba46794818a4681ca0d75d79ef186de2ffd21605aa3 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58d6629cc5c0af80ce8ba1db64d54a8b8dbb3447 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a17c9e87c4f011c4e4123f31a0f569c4486b1197bf0634c4b7ba534984ee98ee +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a055a9d928deaf87437b955a789d3f0aeaa91b84 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6232ac68e90b4f6a1d683396419a2165e2a1355d2be7d05ddc6b0301525d37f +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0a67c6eba7c5f1c08361ebf5b781617dff9faea --- /dev/null +++ 
b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b505fba35b33379fe6e4ccfd33db6b351c169686113993e859687334ab03f4e +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65537de474a9b65863e565eb90f5cef329681cb3 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ef1ec8bd81dc46e53a04c4715afab65ceabba7dc1af1077d57f9b4bc0978819 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5be2a909679cb45ee38021a50f3a963333aaa01 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be8be3b3fb96d3d0e7c1a6d1960d41af2488f5cd5320c7201e241d6c699ba036 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b9bc3e376eabb05313661652f4da4fc4a3b4a46 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e94a30e8f599e46f985620dfe45fe98a92389d85960bf2e336e085bff855ec5 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fbd963933906aaeb35b3606dc99493a299512a4 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64b49e28afd01965096f5e0724568bee5f8a8c31b405809ffc79e4e83f34f6da +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f4cc47a9adfb4a7e526061894447d047635168a --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bd59519d4e3bfe9a2b8600e31e0d3677d39d7ccaf4225fe102877cbb1eaddac +size 199058647 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3561441ebfaa6b9048076211422eba43efd57be6 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:742beae819c7086f1a33324c1699a4199d48e5c606743b3dff6222bf8232d751 +size 199058647 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 
b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07f10c42dd972204e28b71f7853764207223f6a3 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0d3be24c180cf67a360ed56921b78c5b84ebdd23873c82a7868a020a2a6fa60 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..562bd5484c96c635a3659ee91a77908cea326820 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a946efa50475e40301e32c2a1741112c9790166d3f46101bdb507b8e944498a +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..17543f424cf7bdae546e2997aacba0d84783b9bd --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c00f105de96461fdd26167344e12b8f7bef9bf7c3bfb97f8116859d9fdce93a +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..05d8c80cbb2dcef5abdfded666c7773ea1f6557d --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c6cd6f08a4b1aafdf4e8b1ef9ce0a0db666c8662152fcdad8f2af1d29715aa3 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfe24a78f20e6b4fd9e7f72a215bd76dc878bef5 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b890c250e43788d67e63ecc943357860583a53ef401a7410e872fdcbcc0ce306 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f479e2beb9fc3c69cbec3eb9380443f8ea9560fd --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e775ed615152d4afe215be48963d1e4664c9aa566a6444bc8630860db30205f +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a76bf37021e3dabf653b420b46b2b0621292c3f --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:0992843ba68239d7ec3ca9d9471daec4d0210ca97fdbe34557a00f1087267821 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7071cebf7dbacd93b408ace85137d51a3a6d7214 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba2d99ed5ce00ce4516b43ba1e59dc3069394b08119060486937ab2fdcd45a26 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7716759a3384512e78da09adb040d078843ca59 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d2c854f7affe1efd27cd1cbd1f6249702660242daee30af37a106513b2440bc +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70e1076d8eb182dc1fbf4268d3173a0d1dd00bee --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d445315e812a1cba5c630439815b07afcba556c926f01c8926f85db531523468 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b977e5b4b8748c52b73f47650da8c9cefce9004 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d72d5053638af9fe11404b88343e688b5fd7154f46ddb2ce12f1d60b43933b3d +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bbee717bf2291839bfbff1e32ad8d6a4062c72d --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c21aca8ccb986532ecca3d827d52de9a020f178c2d260882981c6ee34b288784 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..52c827b74d556901ccab49e18789d4b0a08cafe7 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd627dd4b79157b850e09401b3540af4c26f1f6dbd23797284ba2b07f0740c5e +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..8dedbf4a46b333c12d051951456f1aa3c6cb4e03 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7a128f796b5cb276a6f29d1f31e1afba6632821c22dfffbcb9ab20f38297a61 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbdfc16aa91fbeecd1d4a9b354de54065b7f5a8a --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7d8bdd4033a6dffdf352c9e2cfa4c69561eefa07ad15c8c1463df8ba9362a8a +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4c3769ba96882e62b4a662d79a7533a1c4c4781 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6769478dc77761fa1edef461cec2555855f87a4e11e26863ea287012c9015205 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8545cca365a5a8d7bb650b32ce06f5449d07d206 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4c67963f6d39b867bac28757daee6bb8d522cd7a925ddbaeebd4838e55c36ea +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae994bb40d5aa2b728cdb675e4128072206d5f79 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c58f268a8c826bc13e070b4c030ebb81ae03646731f39b72e831a547ce3230db +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..482e185346d0d9e90125c413ad361eb8c9bfd32e --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3de6bdecb6994296e1fa04d65d9640cae38565f0fe3f66868be00bca9c7a327 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..feb492fa86f0015f8c00d1ff2d93c67913ec943c --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49a0731e8e4d4ba71e028ff2ec3fa3fb347590b9e47a90c71c6d44e3ded1713b 
+size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb17815c31002f22054257877a964bb9755978cb --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d53220a56d37ca48b011ff5f8902c64ddd395b23d829adb05b4cc904972f3154 +size 199058647 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..87eeb046c71e365fa4cb9f05bf3854c73670c9fe --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fffee8b39249416a2a5355b2267c954337ecccf931b949b381a3f86a2253b663 +size 199058647 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18c3e056b0ee43349254759e50acffc0891a7bc1 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09f569759f6fad42b3e3d7d578655b7b2e2acab32b72a6b445b7bb88df9f66fe +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca80124e2fa0cdcf8fb8d52f9a827a4a4d3f1033 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d31023af04861c345f340128d8b43b758a7f16f90d1544d497f0550831752a7b +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5754f16b00efe4727243da743b800bfd3f23039 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47f366c8302c6da21ba5b1f2c020a58b2af627df7e82eaf52528b522b38b6103 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7035a882dc3162336c01c3789cc3578bf9cc90e --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14d66edd9053bf4c945394d9d938c9eca7c6d133752c89c92875fe7925ab2502 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..56ec4b3fd8d5eb9f2fc1b4a6bb84bb4f2662cd04 --- /dev/null +++ 
b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faebc8e48a12bbe0f0044941c0c39a00abf5dcfc77d05e6b0b6b6902dcdeff62 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e5d32fa543941fa54a5a7d1b1f2b74bedff4d4d --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2499882abe7963b01f25cc515b33693f70fe2ce9f689ad9d6750ec09b83019 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d1ccdcceed15ef6218eb6e8729431dc435aaed3 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5301328a7badcfe60248137105b858594e8566369598a4655dfeffb90226a8e +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb590efc3a362a4247b52383bb9ec7b3473cde03 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:500cc9025d2cbca2ff48a8fc98f497b1653693703beb22b7bd36f96e757c475d +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88866cdb7481692d1e9634617f4151b321e5ac95 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44fa70521109001a09020145e799f1afd3f76496b7386b50f26ba4d8774f868b +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4313d77c346ecc0e08ca488f10f90b2679821c6 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ac298b0eb0b01c595faeca653c912aa0b813244c6c07a5403cb33bddd35164c +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ec39ac4cd1e262fb43c3c8bd194deb6e9d799c9 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f71969759e869bfa026e293a837f2010c7ff5d0ff069c863d3c454173aeb0ebe +size 199058786 diff --git 
a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b560414f85ca18d7b54c7085a89295a3983af566 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:485f901309b81fbb5ac10b9f6ec943cd62554c5b8e72a13ff089c3af89390775 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..341de4e586441fb8f10c32753e800d7e5ac288a3 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:358b3388a8851f7b204d4f37aa7b00fa0f01c2543ca692028b1792efb53d4904 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e12543a0eff1ffc91f1026c0d10bdb406fe7f6b --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c2135722306cd588857a9367069824a13bdb7ddc388761da9d3fd7eafca84c4 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f937d032cbee26c3b00dfdd68d0ebb52b7745251 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8921034cbf26fe0aca54f9a418d2d9544f5990a79de70d326f5f4ba5e5d7007 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..786cd00cfddc3284869dbd59f0f8a3ecad05840c --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ec587b396c603ca6fcbe5d1443f069eed4d96dacdb8eeae46aab006416be4f1 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe7211274aab372fb89cf48e53faa5087f642bd7 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c5b76124aeb20b0493a9a356a5cf9bfb7b9f952ed84519adae67a1fd7065dcc +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25bc457c8ce51d4a019f8230fdce43e3f01de501 --- /dev/null +++ 
b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:530e5d9b32e875ed983d99e9763dced6b7e9a9af9cec0fb693a54cf43a89eb64 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5991ad635f9f49726627ee55fb436c1fe03ef12e --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5271749cfa34f15d7c6b68a4e46b99dd748497b01d83f47a453579a2adb468 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..336a0978f93f8f8f8829f649fdcaec891a090aa3 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9810bcc2a855d066e72442817f0b1c76fbb6bb7f75b4f17e68a14724785b2832 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed73c94279abc83f881035ccedb0b353f15fce89 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51f93add12c3709b2ff3c18718f22d8d87fc513eab7dc1a41ba8d2c172355ba0 +size 199058647 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..adcf7110df834f6ba0e2da23629261c50c8ff6d0 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7448f8050c411401e70095e359750b1efa398a3b16e9db849fbd96616723526 +size 199058647 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..245a909a627227454f300281a08c2e61002b4418 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e37faa6f7061e2103839292df7675c677a2396187edfcef475fe08adca0202ac +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c152a0835365faac9080f59b79f16fe9b6bc20fe --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95ec0f288fb1bbb2320f28dd8ccff09769816c2a3c9362d68ecb1e788dec3140 +size 199058850 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 
b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6768c12a76499c4aa59386674a0c0d0916f38a91 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5555111acc0283d8b605f4169ec5d8638c2d6668075c952f54c89e35334f9d9 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..421fa2ee908ce37b7159dc0ca3465142f5f94ee9 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b596cbffedae7b6c8852bddd9e765f06e4e23311b6c933722ee4a1238a95a66f +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1904a9027732aa4d0bc0825055d628fb56e9b471 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0447ab1ac7f3611c861a511d1111fb68de0a629c510ba8c6f58be2254be1c58 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e285f66d47446e942a6a1e3cff8dc6afda8f1dc --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e9b5b008d276499f3d42b04b2fae85d01d68c07f339d7a8816725db1403ac8a +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a841de117af5de459ceaec7447ddbba8bfb5ca2 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6ea9766499b50fd0054131954a1502372af18d03aade9bee800dfa9976143a1 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..461c5991ce3bb98fdfdeea3d8562561e2748de79 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c06db8fc667d84d72b4106a55665e4b33ef8ece86121a033c9de7fe637c5bf3b +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68aea1e9981def97e1aa3b8bb4c9f2858a2cccff --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:2254d0d83ec2da02a7ad0073e77c951eb576bda433bfb843582cd4e42412656a +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a267b7c75c75455377cc7f8110c7f1a5a30d3c04 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c94b14d58c223c34d4efc747291479dd6343a390e2f0cebfd72305569bcdcb20 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09550d8abd63346f0d032a274eabc356b01f2d70 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77a6eb47862bd280bfa0b444c008d418a6e0fda0c42d1db95542e0499a5086d9 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6968edbdc9b45a8131be34c841fddc184bd96d1 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b7ac8345d3eae28c5ed949aaf0e06497b116ee5d79440dd1b091a77b683d9f9 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bdf3d5a227603f8e1ee9ec78adb976452774816e --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33e5a6a149fa7d54863b22bbdc16ecc5bdf4300ebbc0adfe267df0ef5f0e4081 +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d2335502d0ea4158b926678713cabb950cdf464 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b68c29e3302c881aaec80ca7d50dbc142f95e1e96050ba5ac645f5f47a64301d +size 199058722 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0ec5d3e1290233c0a22d30db10ef9b0f1880451 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5273cc99b2b292a091a6dfcafd45b0857571bcf1055415da7401e13cf11cb2e9 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..009fa38a15600b9d036761d6200b7dc7fca9d6e4 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cddedf4f95edace433ff633de9ff2bf48de26db77687dc3552d59d02e99ee824 +size 199058786 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67bda2e4d60f22d00fbe4a213982cf4ffbf912fd --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cda056a0741461e6fc59cfaf17aed700afebb36daebad2e58ef49f0fe9f8c169 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85537a0b154fc12e8753b6dd1e3675614ca0aeed --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0fe71f229c2fa4f6d460a9dc7e24f95de2c2eb1a538426d76bdbbaa44b5cd59 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75f80b896f9546d05ac803881fedf3bc268de463 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae90a50d237de58f43131319415e3204f0aa8ad79bceba0594ee88ced1454b6a +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44afc7a03aef7b9d1ada84b27132d0604273842f --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6cd7eca7a842498ba49add22671d0f20705556f4690939d165d9693513a2ba3 +size 199058658 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9313fed3cec42ee09d42e7707ac17003bff51628 --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:407f87e179eac8a3002487dbad09c7daaf6e06107f1b5269f5313b0bef625a3c +size 199058839 diff --git a/4b284b21boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b357ec5a627b168de480146d07c041f5b6c69a3c --- /dev/null +++ b/4b284b21boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1243f5de0ecd06c51b9443e4f15c0e88d30adee74b025eb5b2207c142be987a3 +size 
199058839 diff --git a/4b284b21boscar/global_step80108/layer_01-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8fddf772c65166b0dfba9eb8a2eaf37fbd324d1 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58a7fd60efa56eb05c81cd6d3b04f44f12b084f8531da37ce721fda4cd61ff46 +size 167511299 diff --git a/4b284b21boscar/global_step80108/layer_01-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_01-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b84dd2feb2a152cda8c597f1df5f323ead166631 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:932d7c364c6d9519fc3039b8a854fa6344d588b0bad1f8212943cbea347d4117 +size 167511299 diff --git a/4b284b21boscar/global_step80108/layer_03-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbcbf78f4de5981d857890974c2e22097669afcb --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e850ae76cebc212377e287a962fd747cb8d8260f9914ceb7a984d0a12c5c926 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_03-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9388a006a5487b6e6720f0cf604ba239d8e28304 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8367df0038acd8e1819c21480bf2703c4feee9c26cd201430d720905c36af1f +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_04-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8a216cd079ac0ece3c625017323ea7fb9120698 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39c3038d5a7a7fd7048300094c893ed05964a79338b8d27b4364c78a7e016482 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_04-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80bdd2c3e16f0dc32131201f4b452a417ba933a3 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdc11c5a54b33078e829d61a40751634ee2efc14e7ce22bbd8d290fdd4c97460 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_05-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..026b2e7a00658d4cafadb4ec8f58cb59835a6fcd --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb24d98367fbf5f350cea3799675bf7367c10f77f041c2548c951cb98b0369b5 +size 113308931 diff --git 
a/4b284b21boscar/global_step80108/layer_05-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48f203e873580b3791bbfa27274668721dc2c91c --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:623717d03e8510d0be78672bd7d5b0959facbd7acfb0b33d26f22be856dcf533 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_06-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a649662fdd5c1a4a758a7c70ec87078668774b55 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:648ac993e54c110d3fc542389c9d776a5f40c49067bdcec75724cbdfe6e668c4 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_06-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..498abf4996a49ea06bcac3f93632c31f3e55523d --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1d5047efc900fded56ebd5939863353f856471e67ec5c10e074204683fd9960 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_07-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c17efb3ef67f5b2a036dc0cf19ae481eb3dc71ce --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8598474c6aa7888aff5862710606827e996b94e4e86cbfcf34df6c23d596620a +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_07-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_07-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72b211aa565b9f396b4a35aa51ce49cc9164345a --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b74ed642453a4d4674e185095dc4d3141a1d01622d658501ad0b08548a2542c +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_08-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65d3e8664efec5782e559014c1da0b538a706b5c --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f185eb2da2fb48946e4d5e0066ff116eed8d9b047a7af4e860d09ee7bd015003 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_08-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_08-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..038f0c18e38ee7cfc4fb224e60e7594c7d9aba07 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5427324306b9559cad95bff2fcc3508255e978623e9f744e84030e8789999b26 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_09-model_00-model_states.pt 
b/4b284b21boscar/global_step80108/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6a9be96374649d1e5c305f1a89fd19a2cfd5594 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1db4466fe7d9eacf68e650806311f78c7704e3921699200b34eb71197a62c4bc +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_09-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45ba4efc6976f723a8057077972944dadd312baf --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b4aea4a7f9f8f60681b6ca0a3bb0508f8d4acb572b646144470a06e828047d9 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_10-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4366539bcd14261a62e711805eace24a5ad87622 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:299201bd6e9e1ed5861f41b842eff8bf676748bf9eb81a5130cb83036849f048 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_10-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7326420a2cb10aca8a79037fd3763044cdc1f51e --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b5fb8b01fcbc201a7cb2143e9b04475e86ea33bce5c5b085dd0e92d66516e8d +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_11-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01fea7565c200a08aa205ab3841cabc487fe207d --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21d8e34eb409b0ec1a73267fb8f2f8f714e00036e267ccc442d93af154a1cfcc +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_11-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_11-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a973bd8d7d6e62b9b4cb9d68990510ee61a7904 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e31f37c322da46aed85dca382436307fd000f13d7d0c4f09102be75ae507c56 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_12-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed2f0d6f940868d67a7a029510b34ef7f6dfe41e --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6052b5e5440ca8ad1a8ee2d3a254928d772a43033851e7669628e949dfe62864 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_12-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_12-model_01-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..666ce36a8c17bf07c9a4285b85c0f972586b4562 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be79ef8ce54e23d32ad62e5ec8fb1d9a16f373d8f5f80b920b429112422b81a1 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_13-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc4a833a59ace4f9cc0b3568a007d2d10e8a3265 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b1e786033259e1d60240b4ff13321fc6872015e8b934791f2d94a03f412e0fc +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_13-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_13-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c715e80853245cb0779b81d0768a6ffbc03ad58 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1af84c300b449a63362f55530a7748871227f287d4af99bf820c15fc2ca42c24 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_14-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eeade74b6d8be8ea45e83b22e3195ed8b2ffdd74 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:695bc0e101cf979fe634e4bbf4c4ef55117301a715547caed1a488aef7899d04 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_14-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_14-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c9c3e3d13cf1897d3236f649b763d31ae0b3d6e --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac4ec57c4bfda2c5fb56cd7356a8d24c053f091dcf1de0c1446cfdb716f21ddc +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_15-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c57694eabc38beada4d342c060ca24601f7ee310 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5a468b62416296da8c50508ec0770ae407b0396f757336687bd92bfd4d8c81f +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_15-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..919a7d4771d837774c84d5134884ee8a8228db72 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9ac9bb1fd15d2e61dd791296d5aba76329d840667dac8ab80f8f7aaffe60068 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_16-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_16-model_00-model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..b417e167cd254bd475c8bc0b0a69d4609a57f461 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a82d7aa5692d313f97df0feb37624c7dfdb08633667df170fe22b450df69b06 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_16-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b08875b91933184b2804a05ba25ba5ae052affd --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce3a750f1b8d6bedb4db4214b77e651552eaa7695b115850a30591d888d1e6c4 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_17-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f069c5335804d64206f488dfdc8b3a29a05326c3 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0647607e3d87c2ed90204c9922abb3237f89d507482c7c7b3bc35b9d2563e2c3 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_17-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4478213d08232333ad8b20ba9de2fd154c166f3a --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:719fafc9fad96d35be657a151ff57654cc49482abb16f4ca9f864b97f78b2562 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_18-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..06ee5b95148fe884eeaa35463ca80fbc40b2c8e5 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0be1d50ac502f02aa9f19e88b7d22f8bbcd040e0c461730c2e0fede06af959e7 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_18-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_18-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..035b121977c6ab7e6fbf3f6451f8949a66c90f5b --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99b45ff3519b09bd60eb209d8d0f41b73f4f0f190f6d29c8ac5c9cda635c5a17 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_19-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4590d0f24b4cee2f4d1bcdd45744bdba7df02504 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f7f8f80dbc314fd813fc3ee78b9f3c3887dbdb091829dff8b2d3baa5ce16dd1 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_19-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dce315a7e79b9224e213aa5569cbb777975001a7 
--- /dev/null +++ b/4b284b21boscar/global_step80108/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bce906181c0f17028e19e4e3d0684aa14d7cf88d892b1d6dc7ebdc313e56178 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_20-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..403297f24e105fbb617e7ce5575bab4371417823 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29e83330c980d2cf4835f979e11083386d296f850fceab4e8bda0401dbd2d580 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_20-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9052d0defaffcecb351d4259e8637a6366651caf --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:613309bd205b6894789230c4f72a8d3fbe05de3940b1c0c73acb9d97706837bd +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_21-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb6442f118a1ff747204e3fe3a9f94281d4ece9a --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63aed36186d4555b7c998e8dba27aae26104bd2efa677ac81563c4af7f2087cb +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_21-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc21334805fcba852469be0139120329790602bc --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f2e16c982536298e172fe9205b1d51c7f8c9f88962aa984a84a14929d77eb13 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_22-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e70fdf67b2da335b20f81980bb4792ab9a4f3ae2 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3725b6f7313c2b4d0b24aa86b14ca5b8bbf018d0cbc8340b2f3f7ada4ca75d86 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_22-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b24ac8becaa53561ba6d591926347c62cfa3ef06 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_22-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:825e9517707bf3f02531692cd2956d046a2be41da07f913cbc328950d918aaec +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_23-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fec916c6978610e215a6b13ab6ac8dc5b89961e --- /dev/null +++ 
b/4b284b21boscar/global_step80108/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55c20ce39dc0e19b05d677f8b8bdd567a76635baa43be1bdfcd22906ddb04b9e +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_23-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b77dcbb971196ea109df1c9d42d4af2abdc336e4 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b805f3ed96bb5213535484561bba1005af419d21065db408636979f87b1d9196 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_24-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e82e7e73ed47a601fa48454a20d04f287a507d86 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_24-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b5be776aa416b8d4afa5235392beb4b08430d58138d05bbe1418216b60d86c +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_24-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21310f6ee4d8217f54131052a9f23a571a65bac1 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3593f561d44380678661a5b34737b9691b2e3fc7daab8bc14cac088886115be +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_25-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76655bd2583b58f16b9884878a36a8522cbc5452 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f3fa8e2a84175c1476e982bb0546d781c3efed7cf5bb9e97b5db083340dee2 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_25-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf69de1fd2ecd3bf303e25374cae0d7b440fd513 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa0a4a6c4ca1069b13e7929874f8a031d5789465a449811c824f23eaa3f4a78e +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_26-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..baf0d09d18d05340b475779bd958465722f48a03 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:323b0f4314e2e3ba6fc88c1e0d34152f87ae6c2a3353f572b2ff3288f4449e36 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_26-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_26-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2ac036cb7def4a8e5cf79cc89c94829d44adb37 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_26-model_01-model_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:025d4914d10ce4956c966d03185bc6512001d97e62248e6af56e6097b370aa31 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_27-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11382133a142a01052da9e6f4d95f03c92d491a9 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feea421e791fffdc70545ad7d17015d0c09bafad2285fb6a25eee34586896cf9 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_27-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d821982ed2c586dd64ee526d65aa96337a1bf606 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3153c98bd239c4385d223956990d4c63f22271aad70b238d539062d63d4122fa +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_28-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_28-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e4d5f7cf6d8fdcf3f8e44460bc0f415ed94b6f0 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:887bbeba34a3c329d1a4917fe3d32f340396ebf8fa49bc85516075a9f9d16d74 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_28-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fcd7756bd31b012c0285447e6405db592fa7c0fb --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:068d2460507476c3566e102d7acefe83572fa0333f5c50bea8324d5eb80e9cc3 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_29-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_29-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..461acd74692ecd228e3e334a14097eaabd7c8e04 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2625ab6dfdd1c77f287d11f44884382d8ac70ec61ecc8889e93c6b57cffd5f50 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_29-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b058e53adbebd947160dba455b8960f26291f27 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:116dbee1641fa02cbbea7c81ed6e43268e551c894a8cd8faf444afd83ed6e6cf +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_30-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4ee559eeb15ad8d9b3e6065a414bf4464049f6f --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4dc4b55e5c938441467714b9f098bb3da36a7bf5b2fc359144700ded459df916 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_30-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f4477d9e09e98b38a6ba4c14766134fd6e86788 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:058bdc31073a73aba75eaefa36de6d92aae1a3de6782683dd9e54eac7ebd1a42 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_31-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8770357a10c8e690e3a76c3e5ce1e0c7333ffb5 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d0a1baa083d48a5119a3177285bde118298ad85efbc3b1edfe7c1c147f0a5a7 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_31-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_31-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a9212939af5223c9a4c2adb6cc2e5e8fc8ea45a --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e1309096f5683fbd5c7d4ac7b4dc4cedfd5b7f770a9538faf9b9a41f9a353c6 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_32-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2527308a1a3c091b7506f2eb137a5056c8fe863 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dce0a3bc0e51260ebd5a286a47ff11b6238eae295f306a44405d8955c9c07c47 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_32-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec34c741dc21554a82865241bdaaf01c4e1e1fee --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2d4cc2a853bfd897bb320f69dfbc70e7a8cd5acfcafacc9d21ee99457d498b6 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_33-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_33-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d64a58731fabda0b313da0ea3be25a6d55dc481 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b15cf57c5f937e579009ad0180ead0046b055ca1bdc526caee20236319ec97d2 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_33-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b7f7cfbf7a773a89d89dfa80e5632e40139b20b --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38488273293cda3e2dead89dd872c45d7ee0ddfc96066da8d2571b31352aff2e +size 
113308931 diff --git a/4b284b21boscar/global_step80108/layer_34-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31e069c87c6b0d9e76f719dcc8c4db60a4c88b36 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7898f12edb2600d54672827ec1aa38c5c83db6dfd473fb6b188026fe225c97ba +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_34-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d66cdc7cac76dc2640b13504849cc1339f9ffa7 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9acd67552fef8f10f1f518fcf8ca8dcfbaa35f3e139fa4abe965a52c7a8387a +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_35-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2e9e84c016022400a54b438eaa92712cf921222 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9141531084c5fe407a0c8ec916f6d68459661238b175e75870152c55126bc6da +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_35-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51b870c4505fd259514ada2e3fa0c7a9d09a5587 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:734a7f5ff35d1dce2e54b5bea2326940915b612eb99df387f9c18a43ff79b473 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_36-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_36-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..263053de3408f8db6e572dd50b75e9e1fcea9308 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:664fd2c24283d0c3fdd027c421108f705eaff536e608e868a3f20ccb3acd03e8 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_36-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..276aa8c2d70af2fbcd8ed8478aaf297ffbcc6da0 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4458b1391e0dc8502e00048aec91c5a173d86049bc412459d3ede26c6755d35a +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_37-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff4b61c79b43337ece00826bcb8b41e30638fdb5 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b3d7c1f034af1c7c136d12fc63345eee2ad05bf967c28ca262de6fec3e2491e +size 113308931 diff --git 
a/4b284b21boscar/global_step80108/layer_37-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba029e865c058a20b648c2090685f26b241327bb --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05c9bff29068a8306e958fce19ed01e1d0cd85a98872abc459d97d8352b686a2 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_38-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3cea78ff067433c0e936d24ea4d417360d0f1b9 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74bfcd302698db47816a51afa4cf6412995bc2beb4dc0d5cbbfdcafc602f31c6 +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_38-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bedd4397972d5686961dc8eff5e0c5bd249ba47a --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65fc2e3d1bc8b62981e87e83cf2b6cc714c36348c3a3bafbec8ee7511309432d +size 113308931 diff --git a/4b284b21boscar/global_step80108/layer_40-model_00-model_states.pt b/4b284b21boscar/global_step80108/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9c3c399a260150e0c403a4fe4c6c6de899df347 --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91a0810af3e586b5457321e5db827ae983265d3d7b03b2565c1306f434f78aa5 +size 13507 diff --git a/4b284b21boscar/global_step80108/layer_40-model_01-model_states.pt b/4b284b21boscar/global_step80108/layer_40-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e83384854169497d67e4a4654e9117396311fce --- /dev/null +++ b/4b284b21boscar/global_step80108/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbfb5fe5b683b01066878ecf12773a8fdbfe4f21a71d14b4528e2b1cb4bd58e1 +size 13507 diff --git a/4b284b21boscar/global_step80108/mp_rank_00_model_states.pt b/4b284b21boscar/global_step80108/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cc0d7b11ba9f5e7d66bb766d6560bf598a3fa82 --- /dev/null +++ b/4b284b21boscar/global_step80108/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b72eb90f43e22906b8660bfd60519958b0df00d5bf38893a8a07744d78e7559 +size 51635 diff --git a/4b284b21boscar/global_step80108/mp_rank_01_model_states.pt b/4b284b21boscar/global_step80108/mp_rank_01_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3af07fd7d395f7839e972d350185c1a4d7fa97d8 --- /dev/null +++ b/4b284b21boscar/global_step80108/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41949c9d4dd597e2a6fcd7916695cb53cb392e42d296aab998de4c71edd62726 +size 51635 diff --git a/4b284b21boscar/transformers/config.json b/4b284b21boscar/transformers/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..df814c3a71ee49044410636b5eeb4f43611690ba --- /dev/null +++ b/4b284b21boscar/transformers/config.json @@ -0,0 +1 @@ +{"vocab_size": 50432, "n_positions": 2048, "n_embd": 3072, "n_layer": 36, "n_head": 24, "n_inner": 12288, "activation_function": "gelu", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "chunk_size_feed_forward": 0, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": ["GPT2LMHeadModel"], "finetuning_task": null, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": null, "problem_type": null, "_name_or_path": "", "transformers_version": "4.25.0.dev0", "n_ctx": 1024, "gradient_checkpointing": false, "model_type": "gpt2"} \ No newline at end of file diff --git a/4b284b21boscar/transformers/pytorch_model.bin b/4b284b21boscar/transformers/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..0c782d926fc792bba06d957eb1703c05ab40e785 --- /dev/null +++ b/4b284b21boscar/transformers/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:056a3374b731be749eba5bbffba7fd035d172a4f6e17b78512c0b3af321876b8 +size 8781203669 diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6d58b068ae5bbf435ff737757d51e6bf873e8a02 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.3236606671689831, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03434728960928616}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07548519902353701, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027103393867684133}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.27434436630785664, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005190735227370315}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.09843971445320344, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020285748309311217}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03280247893554901, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015645885841163746}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1283696309182241, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0032411037301123703}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.044864078248862495, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012327331599678007}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07235248106657086, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002585891559293976}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.26594645785931426, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.005067281777094968}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.09452977794835304, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019058829105986163}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07204958395594553, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00260168307386312}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2598611687840442, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004868835623139477}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.09372697520153284, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019257190223106488}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cadca7514b231c61b82a21a1a691e5e1f98029ec --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5303952344866372, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03866628930999967}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.13500481273386208, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004456041975528417}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3050209335340558, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005019779051030469}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.15526776951779941, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037112903723659145}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.06800836209907518, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029183003453498208}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.15616590508602413, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0035788033957169676}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.07803113267822599, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0024911371039099254}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.12100254673151754, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0038758351195453834}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.28545106347976323, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004617233759164798}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1409133691299396, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003170026813140542}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.12390505105203532, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003973176754882277}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.28858761694209856, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004669345842896859}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.14369743370526267, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003262924473269048}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c2752204fa38df9df9df983d3c9b0e0f04d8a2e0 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6963169812480917, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.025883064453583102}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.1792256974533611, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005365719346439527}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.34488184545131717, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004970214155406786}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.19513258665465127, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0043495841141288575}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.0957315113405197, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0035283155722927633}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1834475236414602, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037698051706010482}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.10282617536971313, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002972081492632468}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.15875527177897067, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004689203205430283}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.31884399599505703, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004581869680066843}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1746470290433306, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003733649528603583}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.16348748083622414, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004839999383468051}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3238241924312652, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0046243997645253}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.17894683258352004, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003839954827867676}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a943f8e633a10bb52fa6b1e5d5af4dc0688ed8a3 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.8274667802488902, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.042366964185806814}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.18955004198888442, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005662094291901046}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3533218112336572, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004960841139206212}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.20403726404125458, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004566219372488845}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.10477428578899617, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003935236903263568}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1901585058979038, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0038815093874070626}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.11040571352169745, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0032753908389439508}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.16706159999567868, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004913635515279735}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.32629206613585876, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004527254280455867}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.18223468887125507, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003924418368935964}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.17216966052247287, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005091094356122184}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3313273065171706, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0045736991609824015}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.18670844563664446, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0040383704320243125}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0d65997f0f4859276bafc1e2151a3ebb519530fa --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.9591911553106375, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05993639067207907}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.1924412284693008, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005450604205447632}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3705586639190707, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005109363173845363}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.2120064135279467, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004550653208173271}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.10648010653830348, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00377122887055479}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.2022148169333304, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003998276489277708}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.11494130887956767, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0032162324276510928}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.16950864458355333, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004740831041491813}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.34083637585044774, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004631727723114902}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.18886918285090654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038962288174863224}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.17511108437715225, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004905333361798361}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.34672946562667306, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00468590709544727}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1940229701192449, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00401611332459292}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..308c1d0fd059b91288685f79a60376794febbcff --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 1.0660500946745477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.057054038562455854}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.2143907231679788, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00587683477313044}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.37935769424464766, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005114542762979336}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.22541411583738258, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0046964527830320015}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.11999776807861791, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0040878065570314984}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.20846703611506925, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004052332137679142}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.12304296607701624, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0033297218439898735}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.1874233346433946, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0050850214855498194}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3480461100730932, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004676777395335985}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.19952305726552913, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004002247434870572}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.19460128268928695, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005298127155329163}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3552748425353144, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004735986385918885}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.2058454997784591, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004150385211942694}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..15a36a2d217e15871a95142b07961cd42ac4cd55 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.0346205955689389, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0005387702896022102}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.2609393105105274, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002297796887792241}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.05919431140554899, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008128532722581225}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.0016947079558272144, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00013522786800163755}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.015091208295292367, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010023028590081659}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.002924238155455059, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002204715259212068}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.03444508009180932, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005126959545386834}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.2604556924140456, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002268199224560253}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.05894324375481446, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007794281990642839}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.02388328793589735, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00037394497845618594}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.19000592468238123, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017654293062185674}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.04104954287102164, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005608202683646394}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 0.0446593937662798, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01212322535936079}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7605a22ab26ab6c3be9e87856163a415b760988b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.5057362328292612, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006433371408967441}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.3989876838504679, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004870545370765362}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.3936404222340052, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004489407812168967}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.26705886387883326, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005229555960716035}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.20484037138370584, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004083553885711203}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.20245301477178956, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0038618279740674083}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.42025595395161747, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0057680756258974}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.3333446990971047, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004337074043374792}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.3258665538462602, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003965783684207691}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.4457444624630892, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005979013232422238}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.350619501110215, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004439950958674579}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.3445604185348635, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004072911701143514}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 7.705805240318873, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.39271445746759187}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2ed0929eda31b548e5b6733c15cdd3ab18df42e1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.630988373148268, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005562228139019159}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.4799246477199157, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004778694197234316}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.500240361458831, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004172644527560273}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.37181396597783567, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005112789501488624}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.27856393096329507, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004222748696771387}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.28897489966738155, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0039448683747204324}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5255685123033286, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0052326684310782185}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.40098528853058324, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004349425470384873}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.41542346879767855, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038486111843277224}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5562724380069954, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005355340858219827}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4198502948499723, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004340065004215543}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4374377646795759, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038445750973011483}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 13.13450054463073, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.24192164477015318}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b3153a0ddfd259a3c6ed80e4a4776ddb88be6293 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6493064131306431, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005206458394255086}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.4892615986438097, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00478082669472797}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.517895420103728, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00396951690057282}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3862528888679162, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005098489988602744}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.28597309969092344, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0043072728014668035}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.30171853631179346, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0039958865212606455}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.543624778660909, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005057179222803841}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.40842201733115513, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004421919280806686}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.4309807601003817, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037811422688738315}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5754756758954318, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005127760313090274}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.429002529676783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004411864254996235}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4548491812792949, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037688985484623095}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 14.068982889672569, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2968197877026963}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3f9089cfb762c192d59b4322f91c93259a75998b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6543123312997964, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00526064036341449}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.49725103123595865, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004700396819130318}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5246301036110124, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003984503778317276}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.39287572396976433, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005092635515157949}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.29395189402830646, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004329936715009244}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.3084715156757791, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003984840568000891}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5442321756917496, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005097259696532552}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.41399018562464335, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004362821518175387}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.43472057623746324, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003783613140791061}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.576260208132338, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005175720678552694}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.43462623702782055, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004330850511626005}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.45859263436626035, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037604784900039713}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 14.5257757902083, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.27934070667678}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2ae9d6c3c1ed22ceb41cd78ab0dfa6d42f2dd69b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6552722187848778, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005121690444471356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5014318839163389, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004841314732312051}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.528998961079145, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0040111986437856685}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3954692599951466, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005101898040784152}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.2975293972498264, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004380627560293482}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.312329889253345, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004020073838458014}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5462714187744488, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004929822164408306}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4197087461962633, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004508347488373704}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.44005743632484523, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037961990367279827}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5782315118963006, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004987944243191088}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4410697720893144, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004507508365954698}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4641416559814427, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037529394806257313}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 14.647692432598681, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.33108061636277397}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..06be2a4284c195a4829891b25b4913428a8a08e3 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.4175225760583597, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0396891909821717}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.036154702825363164, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009818080564093425}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.25194575603722763, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004063756391959949}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.05913240499474725, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0013929493617710806}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.008622739259285814, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00043823119869464177}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.055070843900621946, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024967396779178577}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.014032525290363921, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006860882714756988}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.03319248736743588, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008139270777232365}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.24076860761446003, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037075484547642234}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.054720707618427684, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011645928856203625}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.028597579926765544, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008872776325204384}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.19465530426339228, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036663974513027695}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.04627650881911135, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001242868453881384}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f1c7e42000f7293435924660f310d4d16ee5eadc --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 9.081272600699627, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.402018320394695}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.5220382053968772, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005987738267175794}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4191646724726185, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0048305024526799824}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.41551598976024473, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004296023954233353}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.2775801888317173, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004986567944186219}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.21926871203257606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003987182862859094}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.21578393575171176, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00370107800929059}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.43396513758122374, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005410511105027679}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.34987365551609134, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0043112716180312175}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.34407708842598816, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003816657058260445}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.4610543113184002, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005584357166355635}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.36912334523668233, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004405007112813828}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.364590803961402, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038882985510075604}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4ea46acf3e4008752d32d640bebcfca861e22d2a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 12.827723467368122, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2395104136983248}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6429253445241435, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0052682241566706514}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.47000703561019436, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00471475669926654}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.5027035099281697, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003999041411432061}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.37759250826943214, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005149133097785554}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.2707511593118337, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004165481663439759}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.2884428300738137, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003938627847598879}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5371248445521856, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005058812589328822}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.3926829496172771, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004369603703165467}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.41815460445037017, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037834517040888243}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5661267893996395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005155408397958708}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4111001137384524, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004388595491116961}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.43908602759039184, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003769542857471681}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a7287b7a6ede36473b2e5ee1287ed42c28d33be2 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 13.74983515740776, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.3369336643179774}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6663828985557976, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004969343558059818}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.48515678680621765, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004772500474063418}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.5239717326093746, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003941772216734925}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.39983932213666673, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004999905968285044}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.28561981827808197, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004259780327670949}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.3080090265553984, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0039734279794513645}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5573959843203435, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004875599891523969}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4039560399628867, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004427731556101224}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.43567806892379524, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037887038508846284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5892685091559403, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004925427939501413}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4242330575924514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004379788059459432}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.4592396211258998, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037316704654461576}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e853bfcb36e766dd8e108e2bebc926a8372c6843 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 14.110843003647567, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2511645721431234}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6682856998360935, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004979185134800539}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4905544087754697, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00468108576365346}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.5296487843403124, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003877700877449371}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.4021464811436638, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005072504377728981}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.29219879155961126, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004343402687570785}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.31329482376051826, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004044426728834422}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5565192816989143, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004900320383045677}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.40974574613193676, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004412607746565604}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.4403844333395566, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003798990173365922}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5890267104817443, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004933639417706148}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.42919808145201893, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004343745851162966}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.46355424353344893, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003709534998103682}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1b4cb3067a762476e35ae2cb0af6d2a27538a6e4 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 14.865246851977174, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2469951733822034}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.665414040691734, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005062732785653125}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4948589439641179, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0048064787252879194}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.5305017223370667, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003988654528152028}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.4046994581842311, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005111935694290756}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.29461815114988565, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004314404789364512}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.31528241951341845, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004000144045412803}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5571982616459412, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004943046835934418}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4138241225793445, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004468742674023455}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.4421868232457002, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003818602071050544}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5893685655133948, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004992290666708423}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.435144330111697, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00447532683122944}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.46632191073623974, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037788576614651527}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..158ff7d3253180bca5556b5bffa9acc0df673d2d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.041325640272890894, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002560445046946543}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.19041477540959695, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004524197400485002}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.04935966890223983, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0013619520000937328}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.007548030574657656, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007170032082943574}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.04879535050018004, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025350606449171076}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.011630773862436284, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007179556558495777}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.04009329623073503, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0025173115168095183}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.1861979765601121, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004396398136666792}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.04776011784545986, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012826838220057924}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.03614944318759725, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002508457252552527}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.15867343894687855, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038860218085092343}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.04098044832889336, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001174299018944812}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 0.18086054928747558, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01999596863932189}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7e2626822aaab7563a2cad25a5bb31c73e8d34f9 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.48796554161260514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006037067740666733}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.42982516412058086, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005042143982997805}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.40276036162942674, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00447207097628362}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.24998961495361918, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004829698568164138}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.21846457768780272, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004069803208873997}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.20232455371188282, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0037130871793239284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.4013731521366426, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005316473393809085}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.35652193084587386, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004474375971994101}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.33023329964113135, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003891418836388668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.42696683460990426, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005554491153938951}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.37572008067001184, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004595101885912417}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.35033460066099453, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004035051365542008}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 6.34872820947942, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.33668537119149733}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..559f0e920ff41c228156766f546b835e7dba7f83 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6220966532720777, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00556098058005664}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.491700916790073, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004899266870189194}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.49922840734492113, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00421342033950754}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.35157330398460046, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005016170583236788}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2757523128127938, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004283617860672476}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.2773940114781611, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0039115931012414}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5072542016868399, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005141659470235974}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.40406132924633337, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004458244442063251}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.40603118525029447, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038085025798868743}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5397172744667709, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005262171032735078}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.42514118811350743, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0044675186943622935}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.4300464517574411, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038309247796264817}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 11.432615162808808, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.6195135716025121}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9d4ea5ffa6b9952bc5712f3372cd406a1065e31c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6424519756213152, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005185084062837668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.4975396288082743, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00486795443028984}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.5182163799306218, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004028853507439458}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.37021817845747296, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00491121715575853}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2855540858782935, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004340737622712597}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.29522310783911515, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00398082279125943}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5259444546489896, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00491487662373786}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.40982402267867996, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004483163518003983}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.4239367573766619, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00379828921652199}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5588298068318703, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005000500827487751}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.43118988673283126, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0044827837991875796}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.44843024559472394, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0037854444143360735}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 13.427423242082552, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.21378430345614655}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1cdba684d7b9e252a874932a2baf09441503cb3b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6536854686117716, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005055038024884472}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.49481801273909026, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004774469373884207}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.5229740272347854, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038958904995235783}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.37832323284536284, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004895191350276094}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.28403935935102725, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004268699115858015}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.2971667864152805, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003870964512172995}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5331243023450667, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004788026817581983}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4068796874397253, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004415817647854088}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.4266743694870892, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0036930292492481412}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5679878693884904, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004880518673065624}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.4294770442196603, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004389909285885383}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.4524753167873619, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036313591386608456}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 13.213076094213868, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1911365124809486}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..267b8622806ae9a0695d9a85b74422a1c49d283e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6562302674293572, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004978213958757384}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.4993162400355719, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004928232006071223}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.5268008342959368, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003941366188303182}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.38281434184302715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004993242918072653}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2884911653993333, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004386988321702633}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.30184258420473004, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003973705493904007}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5369344693567049, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004792429508511601}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4092001227605644, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004488279114524377}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.4297537206934182, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003720711472388013}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5723322848404121, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00485312902623496}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.43323580321413074, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004506713763132942}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.45631464166756985, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036619891881959094}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 13.591065355686462, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.33251846754741937}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0f8b8169b4bec0621d63bcf6772b0c4e4d5b9f12 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.06529894179611363, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016701036636380111}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.28611265999014596, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00466329131578748}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.09397440376869791, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). 
\n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001967935502764586}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.014752341752067778, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007834696465856099}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.07635223264261566, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0029933179291138446}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.02250038672501546, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001081320815406387}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.055742312174539926, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013139324791962169}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.25613180684246944, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004131606370455397}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.08117813588062889, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015689150539593354}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.056738696823048364, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001510447808113087}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.25166816812712434, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004300637878529783}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.08140414401609677, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017706010471645551}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 0.45356160811555557, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06126183633549046}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..117c34ab942c96baba1d11dc66c04e86e809e90b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.4751602143647354, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00556979352231675}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.4529875909582768, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004950307941168847}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.4149721611957904, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004307580646927434}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.23961216679613442, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004369953215662577}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.22983670660533184, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004036151644367376}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.20685824798380176, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0035445318812293513}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.38776635650699914, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004900676753268663}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.37234842080956987, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004351586149131922}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.3372488007777988, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003710311545338347}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.4136295834528666, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0050805763062877}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.3939649739250141, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00446027620490063}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.3592040570850069, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038367843690821552}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 5.946501202386316, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.24351717513713797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ead171095d2d80183307200dba75e843b44e2c2d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.5846333211355946, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005740649275229668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.5013225392819405, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004823682144003579}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.4874115938302624, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004296628165610245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.3264068039295695, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004964606354905216}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.27678176045081004, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00418664596670321}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.2674108062793395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0038746688112602326}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.47701281240617616, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005208435147495593}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4135138973656255, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004400914707960098}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.39705535267405395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038555106605338767}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5089420537199473, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005378042598242251}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.43476269636565773, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004386158883269227}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.42141948798275997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003907811087276578}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 9.556195074080163, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.6010773991832432}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0c311716fff338fec490cdea827d7b39d89cfdc0 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.6211961468572741, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005451043928857081}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.50338330282331, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004845151895746252}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.5118474084220225, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004211249350411945}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.35764468264912685, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005000680935796298}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.2872787024309365, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004370883087249544}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.29116539392687324, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004106307235430533}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.5069648439596643, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005096748917589604}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.41279400233423746, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004428779743643375}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.41711953001787855, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003920437530291511}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5395270078765012, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005190497474117719}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.43551143891048294, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00444172146138816}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.44211630816776704, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003913688068087822}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 12.602247258212916, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.5603144875513827}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1d08a6c77dfc575636f1cfab36c431897c139a50 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.6366194970225395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005117591858573804}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.5064153790661966, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0047764966262347315}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.5226187226330231, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003919125144592862}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.3676900136060854, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004895179178207714}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.291575080362687, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004386520481516148}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.2978772889798726, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003939061542109874}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.5197169335498214, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004853089161070492}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4156312087117916, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004450919756390418}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.4260200185326371, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003725170111967127}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5536011748870678, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0049218194275952665}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4398531089513908, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004458785918722786}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.4524894725742852, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036906432733459713}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 13.861907897053662, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.21106760860433635}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3dca54347b93dff059e850d1c15c63028ad0e996 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.6489899322749847, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005165184579861035}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.5065958613699326, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004851662674645652}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.5275749184040505, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004065463426159502}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.3794125655803978, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005037370423173522}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.2915521692478834, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0043068877311057425}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.301972918522207, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003960139499843082}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.5300511231875715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004912600312705692}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4144029433426168, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0043879949295119345}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.4292023909929102, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037453014462275973}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5647334651592919, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004986228066141274}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4389205713401686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0044359561894050025}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.45588057276344957, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003722166270754704}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 13.715865491939756, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.27544078012455814}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c516bf3bf664fe15d9587501d08a0a69cfa01295 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.14151848718302718, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022335651457076155}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.24752522343029543, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034583124706848373}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.1670780364841084, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002317030768961082}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.031513222807695855, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008276838755083649}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.057870603612401005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015707399503954848}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.037518035745726125, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009219458435218357}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.09919806351581549, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014930174402303853}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.1804489451534078, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026185168421145825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.11838776198838015, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015710210393614862}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.13052622432432498, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002064356881448907}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2290141140670916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003219882829713742}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.1541763589711041, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021393511225024746}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.100034714712357, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0935734566557911}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..366394f96d32e8dbec48589172f9fc9172582b63 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.21496359349054053, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002755256183241888}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.2977766333640761, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002926238063431334}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.21945211051932914, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020025608489888857}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.05319078037486938, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013739477379752152}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.07248231514348752, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016065512771453777}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.05267796473203011, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge2_fmeasure_stderr": 0.0010630123803576901}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.15474418633419101, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021100703969687915}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.21701386393503583, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002266202639453047}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.15695037484665128, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013801818301413765}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.19867342000677637, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002582375862443155}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2759880519793925, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002754502157352143}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.20282457983589916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018679736258282954}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.663367618392156, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06404842058801877}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b253118da2ff053301cf7089663a423e29860a6f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.25214239134804445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003137662714570643}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.2925282850559076, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029278623218607024}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.2320888922471032, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020359189995027894}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.06893560361669435, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001724674687618289}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.07621541050409261, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016383981894790049}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.06025475825878314, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011801337536938873}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.186139567553756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0025221021759129925}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.21455509093210598, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002256479118960565}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1689405481199798, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015013829962533184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.23421093467716025, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0029753113494921965}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2712212728272204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002749323743170995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.21501937359842724, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001910276437232937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 3.204510495424181, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07787891887551489}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_3.json 
b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f27a824cce6ca5bd285c06c405b50467570b85d8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.22851215436123426, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0036662365296823663}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.2318184716721262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033564149292511568}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.1924533187886367, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00245294080699731}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.06394913109289467, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020246406144319314}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.060403040894099265, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016033403202768616}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.050085198320322696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012243114894951602}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.1730054498692057, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002985153102701943}, {"task_name": "GEM/wiki_lingua_en", 
"prompt_name": "article_summary_en", "rougeL_recall": 0.17225807912085475, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025651097238941806}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1426385624538846, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018324750437799981}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.21351220553283318, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034848884034780077}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.21522664273311504, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003127002658550065}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.1790464884859616, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023023834259245433}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 3.083887671263152, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07310310351916077}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f603c51172e014b90d80117ae77a318f19427453 --- /dev/null +++ 
b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.07835172075365858, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0030934416870520387}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.07356972793882564, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027423636784068436}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.06162695012756539, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021686829458294928}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.022811004840934415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015155046141358778}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.019466079963312354, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001118722193367272}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.016346805633122616, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008821546979197065}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.060220939468153434, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00254445620539224}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.05425129446487449, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002038470215681771}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.045627479104232455, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001618799146251457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.0732763918547995, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002947411739298198}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.06755598716044736, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025110067465870718}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.05685236668461361, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020019069945384667}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.1862301901852318, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.019321081401488156}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ac6681827b24a64ca09d96401cb88e461415a5c6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.013258396168294844, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014673465961484976}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.011123987415724398, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011531151327320928}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.009675821577797613, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009637023261395425}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.0040653951508634375, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007388608602909922}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.0032335182862330307, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005205157707758816}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.0027133544163391752, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00038619181882107434}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.010646328299525897, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012705323854153526}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.008682769460160939, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009333010065493198}, {"task_name": "GEM/wiki_lingua_en", 
"prompt_name": "article_summary_en", "rougeL_fmeasure": 0.007456643010130254, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007593140109279754}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.012564310140686598, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001415778942442465}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.010438016150849468, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0010937333492898294}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.009044275613373771, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009050612725574305}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 6.622577066013761e-12, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 8.422075320916485e-11}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..01f7a9b9dfa886fddb2220b9714738629afbe04d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.06164473535235492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001405426454029671}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.09131698896316866, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0019207087843768908}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.06747377008609552, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0013675427733203485}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.006790182377862598, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00040039637533508676}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.010655783228026442, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006561371102920868}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.0075404220501442305, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00042912197634379046}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.054017605881063746, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011708433559756229}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.0806353288622824, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016043982386113617}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.05916486574473254, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011048168711011383}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.057274185187926, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001306487045995208}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.08527500630924882, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001791516197068332}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.06273421540506535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012595672988991648}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.43426362863975315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.029904347346119995}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aa19bf27a67c717ebb9f8f2787261d6393636504 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.13567067805697516, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00244783829708697}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.13624606163578262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0022994798156016536}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.11604016621744329, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016757575785151973}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.018250954077339773, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011620613888053647}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.016843917270409188, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008988934545392962}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.01382140392181167, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006697623363819288}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.10820701273702638, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020324978288905455}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.10754510337495378, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001775180479340234}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.09122973644360803, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012546408754974406}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.12961910351952088, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002334378178824185}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.12999728653531273, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00216120080161998}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.11080814949606399, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001578435475966608}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 1.053879603834425, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06162451398173484}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e1cbadff1a610f15ea0a4f47db9d1d46c3ec9b79 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.24296232579509847, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0036738254033485385}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.2154735961186183, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002951643463846921}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.1899984072177498, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022467111119683423}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.06583133042088486, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002122690639857749}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.053729671295266325, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014610122704433743}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.047456671275884474, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001201924712858666}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.19235125113808507, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030450441817123828}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.1689714728400649, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023296670488581668}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.1482075417640661, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017139326615536051}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.22948210547962722, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0035338194011097584}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.2023936742681393, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002773600109323121}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.17874509637448255, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021236174797339756}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 3.0768418397866837, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08871699114579137}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5a46f4de27a1e92c9fa72d7f5764033018425d4d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.2367568945386193, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004029356352539418}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.1872797276779158, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0031039333230205887}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.1730409097841582, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002502695385341917}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.0682055839178317, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0022669198450948843}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.049755521305723714, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001464354667787732}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.046145260990371584, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001243064106408716}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.19013952640409754, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003394147032570451}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.14748074449453355, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002454574356183073}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.13627482491549403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019604976905496277}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.22321388782734303, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0038594191530477748}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.17567553909004804, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029251708582556098}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.16240071165527958, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023599640917498998}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.5833089949318944, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0845794278500129}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..aba9e9dc6c60faddcea852b2da9869ae135e8f96 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.0826705327963426, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003344790852751325}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.0598723767707158, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002421888232149976}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.05622854248419195, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002109056978999073}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.026818667629535033, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018242436179135937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.017564451176379225, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001098422296088496}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.016425733148301386, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009348754957675916}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.06803138505166514, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002872500027172234}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.047513674740425234, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019289443522760462}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.04485111200457293, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016806426649935266}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.07839317558374391, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00321469710365829}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.05605017651829644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002279210434399837}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.05278414549118305, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019875613494061855}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.08092191325240405, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.011429683298761291}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a1db19174b397c2ea6a683d5b516d67235fea2c9 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.014632085154467636, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016062592024331424}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.008280526295669806, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0009222495294936597}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.008375721168007073, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008677802247575033}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.0048406013014464944, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008704683931109317}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.00222638820794996, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003520578406679729}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.0023532242956938723, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00037781487436313303}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.012386669107254842, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001411967038983594}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.006765656854714533, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0007480585879119922}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.00684440636036312, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007088169655746148}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.014102741266491818, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001565517437796769}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.007844713254559151, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008781423555840759}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.007947675842187496, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008241812272510267}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 3.3774607869837585e-17, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 4.741357062272074e-16}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0b88f1fd0c600e6c6d6c437d1bd5b126ab44a89a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.14123865982295705, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020202809729221675}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.22307005586506318, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026843151043358018}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.15897173803983955, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019161847895579139}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.02549989877808478, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000721064965974684}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.042454775928502095, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001263425095394224}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.029037224536015666, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": 
"en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007750760929234713}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.10788620148520663, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014459893492170886}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.17577354713606208, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021009355572715457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.12234436969470364, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013376679737405277}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.13146543271367658, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018773489651251884}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.20864419850100588, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002524852725911657}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.14806558080664062, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017682134567323948}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 1.1886223689461488, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04424171563604672}], 
"config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d7c3be6077d58e5733840ca6953d0f42c3d92662 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.1257754761068784, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021554530530307884}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.12439497879445872, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0020706655200342433}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.108101701701726, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015518401988124507}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.013364384862998537, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009762375321509258}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.012700824707301519, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007318396121340969}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.010536736032612563, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": 
"{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005708911085444898}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.10102917482089223, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017876771518052959}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.09927362783573078, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016105321153965717}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.08587323698176857, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011828740918808}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.1205631392182017, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002046685977843568}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.11957668500631019, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001979489711031859}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.10374596670283773, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001466352422028802}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.8511460277480039, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0888846234785566}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1c60e276ae4129e4ae0f0d4f280b9a6ff256eef5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.2456527083977364, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004266235274873931}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.18111674598382005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002792493928605254}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.1665670097445064, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022225643848745908}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.07134765002917946, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027529254958689104}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.04279317497612642, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013473083232237544}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.04013034602187669, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in 
English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011603724359419611}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.19750190808423526, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003722471595541706}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.1410130748053428, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021637107870954935}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.1296562897314334, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017086381457813544}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.23277149049914897, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004135498321772824}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.16985926116644126, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00260288060701269}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.1566152490230738, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020971815171293004}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.4589494304991546, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07199650932201095}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bdee9a7e82e64561d15bf9780cfdf9e428acb379 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.24850283041195112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0046336444774253555}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.16076069150010056, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029394946336236075}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.15523387532764027, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024315620580638354}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.07693886867031917, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002930517622308709}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.041690092863002085, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013593030670533894}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.04094764568299239, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in 
English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012144142736985179}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.20269496412708723, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004051291594703973}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.12562253037302593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022620290785844077}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.12195613865782994, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018817813465346709}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.23597732477141262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004487490296782531}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.1509130671270635, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027512008497031313}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.14609016469928118, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022937337012322694}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.043763896129358, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09423626302294519}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..56321622d7d95112df49654f7b38d493b54604fb --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.08108654249220046, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003456659471152361}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.050899313397824254, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0022355417372530366}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.04938144254196313, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019543120580950636}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.025696529427655938, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018788007092688267}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.013858832830745406, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000977245425782863}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.013439239930666353, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above 
in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008104222029332782}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.06722277079505945, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030105977885906063}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.03997350902589808, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0017457034603758368}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.03922434499043311, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015568603703032396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.07726587114739103, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033370296423210406}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.047824936820165094, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002104240678200406}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.04652217653531876, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018423780684281958}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.025383968929684336, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.004535204806063187}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0e47abee2ecfc9f4bc799f04b8051822aa210113 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.009996893133935812, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00132769559493904}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.006640185975806153, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008719308871004512}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.00617290599529864, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000754682849470479}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0033897673826744925, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007496264574720278}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0017024153835973313, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000337089156117355}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.0016426619083960594, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text 
above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00031435695145720047}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.008527012867083883, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012057970660817696}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.005349623575215652, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0007031830380440492}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.004975083635745968, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006128478221884961}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.009603533312820866, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012952829451305296}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.006297727244644937, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008337320152137014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.005858547923035401, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007214266745845214}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 4.1376706030398795e-18, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.7050554509881984e-16}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a18553f427de487c700f8fbaf6dbbd62dffb1a68 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.1325770613482503, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025192320948419973}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.20000361049618887, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033883319122602765}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.14568887330505176, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024304012352800524}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.030157012060673342, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009243132657339652}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04736363206911103, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014655640703641456}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03380900442536606, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009820401088417524}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.10196660477211375, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": 
"en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018957212855749772}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.15791691922428616, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026796281489341964}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.11261046327516765, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017887243866971374}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1229739908341885, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00235776548625545}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.18595911596519776, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003178608298694486}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.13507023756955555, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002258433462891094}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.36187522199539, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13845059941627463}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f6e2adbc160205e2c0625f3114344f41b52c5676 --- /dev/null +++ 
b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.253166878261391, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00348988815268275}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.24765343018422936, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029563082706124282}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.2120115141592031, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002201889759622243}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0699683530277847, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020960827184197486}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06305007691775547, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015388476493559788}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05447927752503051, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012618043977617596}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.19452273753171953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00287285443881589}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.18824812828488288, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002306801191665917}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 
0.1602275848002253, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001660994268757166}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.23777364194884903, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033469123908777755}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2311323526084588, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00275352833642066}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.19811964314605882, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002066579949503973}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.3670705128175316, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11604750071877634}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..adfd5ac4e3e49d8486b74516c7ada4ff1e80cf4d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.3042280180045783, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0036548780493867858}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2657722896569855, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002856010555746096}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.2423480992295036, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002213386982039197}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.09313769731615627, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002310899996162063}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07567489802793778, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00165355417900699}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0696826972387138, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00138974451583577}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.23678748519073842, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030313841471723817}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.2052468833712137, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002285031155511772}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.18663522201860716, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001740118367633801}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.28748422717826916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 
0.0035473688052382567}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.24946630704943015, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026967966099014384}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.22792203815398218, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021140458556974436}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.453155894580253, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07102095729922746}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a2e22ef02e50ad0d13fbdcc54c60ce65ba09228c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2674845496647863, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0039889863741841}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.22160894672160433, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032468878599473164}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.20630036414089614, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026291569934361863}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08211313390771198, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0022759780285685387}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06436903346497445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016476213678321223}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05970860054565866, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013699064127955533}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.20900633859522688, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032474779669193566}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.17205068176936356, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002592219077108911}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.15931256883075381, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020301040819156045}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2532503820549326, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00383540860381719}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2082619543002508, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00305787284903526}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.19413853066180423, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in 
English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024798764798439764}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.3998244876185093, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07879957613227935}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8ee844a3081c531eb997a532c17223627a0e90ca --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.092208189110962, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034189214358844324}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07224586073962391, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002716708158950956}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06668350472619872, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023223709305170443}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0276141917171982, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016764660911735634}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.021177864192540377, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012371206877780988}, {"task_name": "GEM/wiki_lingua_en", 
"prompt_name": "tldr_en", "rouge2_fmeasure": 0.01919932938843845, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010049481476114978}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07354514541641334, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002828654152146427}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.056562703185008854, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002168458322152195}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.05216028088524061, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018383425985936733}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.0875093402820744, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003281948169701787}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06780620805618982, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025609050791819783}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.062781251763141, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002197731580536598}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.13483879286922137, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.011222185156728685}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d176eb690b5dfa87c7ba9bee7e8a66077f2caf6c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.015272421462795964, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015642398537380597}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.011340665089531033, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012049966779817216}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.010526309548207395, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010240475142557462}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.004861708599486756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000775800963972687}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.003729726793524184, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005770564824763594}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0032474649318922875, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00043456520285324997}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.012655622009735225, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", 
"dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001358908157095457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.009170085952684662, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009929814693818242}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.008507384832807946, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00083698404273872}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.01465587941098198, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001510887749703006}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.01078936789465723, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011506077585367758}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.010004236605713649, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009708230001026674}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.1051284504073666e-13, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.6395896486810652e-12}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..f6aa2252987a4496810d69a50e154d7676803997 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.09059810453686068, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015056342386344352}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.1275422175021168, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002161826704046198}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.09697584440146213, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015059466263845065}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0074501677100613254, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00042807734501380953}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.011696196090971528, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007203986479520007}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.008274258472602202, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000462953442764119}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.07617050446768113, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011196210281897148}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.10846177006819308, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016681767488937356}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.08174820152892148, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010962242599486596}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.08401444296488689, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001399793796844978}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.11878346946127412, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0020229669439021116}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.08999801889182339, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013981813175196261}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.510768349225832, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05232532369609947}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..47515ce0e870d2a5d7c6eb86b1e206fcde1e07aa --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.11804797381560325, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015596770844967052}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.11938733271917377, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0016474965429255735}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.1046855555474759, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012419703179327097}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.006483865876805875, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003960380307594885}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.007429698874370853, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004988250290226641}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.006046177285954043, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00036102566099162976}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09444605727198088, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012149153617127752}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.09562634721888393, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001293071021485607}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.08322112417343631, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009222719266399656}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.11382960740162716, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00148706494623472}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.11526021094661845, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00157453107301123}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.10097615314050218, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011811528071622318}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.5467096839704352, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.040222632718890344}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7669c945a060988cb9faf99e57e61c58536e7c9a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.15227977915443636, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023301138728988053}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.20288951458698465, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026600382652853857}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.15231216408550322, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018601662730155298}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.02590295524115366, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010945836739744803}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.034150951995566774, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011817222469903002}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.024920583594549812, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007991257423072374}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.11488386720934148, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017873853840310852}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.1542948926217176, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001993853857137535}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.11411475231199182, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001289095544101643}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.14358124760794166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022003147816700226}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.19123947411527695, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024807666639582162}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.14348417776161898, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017290483390471727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.47208900301148, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04893525563104916}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3ba652e97b7e28e75e033234006b18b59da3a3dd --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.13714187658215166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029978515027231094}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.1608525267779809, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002968280879200802}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.12327064005790836, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021375429901711062}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.029404520837665662, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014427560961137382}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.03107950418628417, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011977089285448693}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.023889029493667138, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008571997910771259}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.10640730246971045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0024200277079664346}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.12448069660646223, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022996154759259723}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.09418628790342458, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015811319768149715}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.12921646674810666, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002856413380294782}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.15106265513486006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027769655757646137}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.11568545523374031, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001996893570244593}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.645895957917865, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0936463946325968}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c2add0b6f42a3f51ecfe545cdce2c38a511da803 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.03827905412528618, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020579127622226234}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.04203913187572869, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002047851514306227}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.0325446644737222, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015222836283055165}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.010034383577420857, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001074414724889592}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.009208465166841484, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007685505603330088}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.00712730989622718, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005315862223791575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.030774425912045378, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001732074858974945}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.03310013864383069, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016065163081856765}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.02542231281139547, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001170867527428399}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.036121977078342034, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019666280693929816}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.039179240084264884, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001899803034136687}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.030420087800970795, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001418424307914252}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.04745666192513143, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.007477301550260494}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..951a69c6ab7a2ca28c1f38617c07c329e615668a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.004178730643014739, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008310089172819684}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.0036527771749499754, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005975815933235252}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.002971493717909892, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0004944141714386947}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0014207035995972678, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005157762753599669}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.0007326441133672487, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00017937753316799973}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0007222549546163281, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00019897321052266074}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.003300054936861825, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007057934923975211}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.002763617912706572, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004345967221402754}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.002227968389907064, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0003632109933122061}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.003989732174305017, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008013379409808136}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.0034524072384871265, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.000566274664103085}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.0028157162884264765, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00046819601162021053}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 3.148083063194154e-22, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 9.742672408621782e-19}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..84ee6abb9e697579a8b48855837c6a3863e14fa6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932577}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.358, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015167928865407555}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a64dca3f4c38c60970f293cce31ed5520ad4066b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.339, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01497675877162034}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.345, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015039986742055235}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..df3778f5c0a14d9f31791a9c0359dc72c89a856f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.353, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015120172605483699}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.344, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015029633724408947}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1bfd29859a6d44be9638162bf94f752513706460 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.345, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015039986742055235}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.344, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015029633724408948}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0b19d8656bc0a6d4982961c71b752ede75f5f6a1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.341, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01499813134840271}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.34, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014987482264363937}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..42a80d8d0a9ba1f50661bd674128517cb0b38917 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.342, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015008706182121728}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..363a44d7deb0dd0314c5171d2eff0aad6d03fa28 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932577}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.375, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015316971293620996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..49c26fe4b10317ac9d54a75ec3932dc5356903ea --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.343, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015019206922356953}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.341, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402707}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b2c791c1a2ad7c8e42aa6ef6a1a8224b744a48b7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.361, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015195720118175113}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.359, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015177264224798596}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fa58377cebf5c5f8d6bd29d059f46b89caa520e6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.34, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014987482264363935}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.346, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015050266127564438}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..85cb4c18da3e92bda899141db2111434080356c4 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.345, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015039986742055237}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.339, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014976758771620339}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..73bf1f87a3b327d4d937ead8dd6f2ec267bfc703 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.342, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015008706182121731}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.336, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795023}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5604b88c3f917978bcdfedffab2eb6fa6a062100 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.353, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015120172605483703}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..61d5d7b59660f1ab8ff8a8e3da51bbed5857688a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8f6c3fb89946545711c6c0ce3ed468829fccdcf2 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.328, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014853842487270334}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014830507204541031}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..08cbbbd7f85d3dd0fa4fc9d3e158ebd896fe5614 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.345, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015039986742055237}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.342, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015008706182121728}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1ae6047d6d89fd9fff426f921a51afb45604fc4e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014758652303574885}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014830507204541031}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..52d35ccc327bcbf871e51a6bdc50c6dbf6e41301 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014758652303574881}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.316, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01470919305605713}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..957d146a4eef5b39c3ad957e80a1455683a07e4f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229857}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.312, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014658474370509005}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5a8c44882872eadfcc79257fd1a0711656068878 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.337, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014955087918653603}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.339, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014976758771620344}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e04933d44d05c0a327702113fc61218bae7b28d8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811485}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a18896cab96504bdb03f15198d846290ddafa829 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795027}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811494}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9dd98f37ff4c549e3163b7ab05c9e63750330f28 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.341, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014998131348402713}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.341, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402714}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ee6abbe2a9fe0b2d394db26331d2e681660910d6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.328, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014853842487270336}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456736}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b8169f4443da16ec8e2b6070a520b15fd8a70ca7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.354, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015129868238451772}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..edd840d918774f5a0116f7021941931e7af03d20 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..58e55fc65c36a134db58eada4d812f2775fae01e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795027}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.321, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934649}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5f127098f9ac1c81005945761d515b4a72107fa5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.343, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015019206922356951}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.34, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014987482264363937}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4f39467edc9630843046c8bdbd076d5e07e21036 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.33, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014876872027456736}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..07924d6abef9ad5ee6b5419941f6a2883482c257 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r1_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811482}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996681}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..290e35a22d998af428bb052ceb13fc5a11112aad --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229871}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.361, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015195720118175115}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..67686a5640aed0ef81aa89176a9455b2e2d95af1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795021}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014944140233795021}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..26e79e6d31863172c9214532c31fec8ba211c293 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.341, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014998131348402723}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01493311749093258}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1229b7353d8b6f6377800749de73dd4c8b9b3823 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.326, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014830507204541033}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01493311749093258}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..11db0399dda9e6a03b0005a15d2b2fa1d96cc8a8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.327, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01484221315341124}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.319, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014746404865473474}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..841d364c629126934e2d1ada640e979fe55c86bd --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.321, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014770821817934647}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.305, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014566646394664371}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..724e79ad99b286da437c8d980103e43b86ce72b8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.332, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811492}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.328, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014853842487270333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f056115106d800010723cb83875ce965a9d14ca6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.33, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014876872027456729}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.336, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01494414023379502}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1cb41257ad2649d5fed7a92cb18d1632a528238f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.313, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014671272822977886}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..165c52d85c8c250160fd28782c466544481e4482 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.306, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014580006055436967}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014498627873361427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0c00ae13324770174aa37b965c83a1003f9b234e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.304, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014553205687950432}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014498627873361427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4df42dd4aec99dc422ed04a158d9b977094600f9 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.312, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014658474370509003}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.311, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014645596385722695}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..76937864a31eba33aeac67e01a15e2b37ad3efea --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.316, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014709193056057127}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203933}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..48eb18e88cf3a4d313eceb2e96cc10166a5a38b9 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f9d4f581d1eb0fcbd192572d6422417a237d8175 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.308, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014606483127342756}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.313, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014671272822977883}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..07ab44a8ca47db87358c160c51728fa4c416fcc5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.317, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014721675438880212}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203926}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c1cd9b5a8a9c3278f524626c975e6dcb2a03ddbc --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.308, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01460648312734276}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.314, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01468399195108797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1ec6b422aa33600d88bdb2a9403c0b09a5103365 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.314, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014683991951087967}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.317, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014721675438880217}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6cad78c593bd924691856133995a900ea205dcc9 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.331, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203931}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.34, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014987482264363935}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4ef96a7ea36dd144da71132baf1e2898f93a710c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.317, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014721675438880217}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.317, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014721675438880217}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0117cca5a51db8448920ba7ca37496e96006ed59 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.315, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792518}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.316, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01470919305605714}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6904a8c0383fba1291b7db606a2c9c1902f8c683 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.339, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014976758771620339}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.341, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402713}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ae45f739a9e686cec1c630978d8ce4e925c0d488 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014758652303574883}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.324, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014806864733738859}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..685e88b6eca80706d12c7c1c9cb0d409ccbe3b9d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932577}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.334, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732954}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..74f8ebe03ed7665b3568509888feee7f60a02d60 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.309, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014619600977206493}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928364}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..025f768b0118a00b3eeb5358c8466648be2342e9 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9bb7ac3d75ae53f918bf91d8dfcd9886e7c7c330 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.32, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014758652303574891}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.309, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014619600977206486}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..deb5aa825b70efbfafcdd4df1331bbfd71eda53c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.305, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014566646394664378}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792515}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5229a3cc852a3797fb802ce389a99308c71307c1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.303, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01453968371053524}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.306, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014580006055436965}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fe2fd3f02789f3cf880925b185f52aa4c315ad82 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r2_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.304, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01455320568795044}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.307, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014593284892852628}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dea582dc6bad0976fa95904ff018b224d81cc214 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01364760294240639}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3408333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013688600793296939}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..069f68498f39e582bb6991df0e26348a9ee8c15f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01364760294240639}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3375, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463658}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6a0bd5b4340a1e1ba34b90217b3709c4a66e2674 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3233333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013508372867300219}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.32083333333333336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013480882752851557}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bfdf4820d9a81f654d0526f0c8347a4391c30449 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.32, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013471620929769135}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3225, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013499258621103244}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..227c67552038e8daf3d31d44411324d571860cf0 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.31416666666666665, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013405399314984093}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.33, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013579531277800925}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a1f239a486cb083995f5674c13997802739820b6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3275, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251944}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013526454480351018}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4b4022e8eeb33f8781a376bc829ac61f9f23c132 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3358333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013639261190932879}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32166666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013490095282989521}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6eb22c8049b8e6d19ca2733e60b1b357b5c242ea --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3475, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013751753243291854}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3475, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013751753243291854}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5a3d8461ca9a61bcacdb522782dcda6dfbac1919 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3416666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013696658778002515}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3375, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463657}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5474585050ab982c25be79e7f575d49833600316 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013630871843821474}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3375, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463667}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b73a89d8043a6c201e9f7ced61bda170ec36a6f7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.33416666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013622434813136774}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01364760294240639}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f6afb41a908802a256ff87bc01f73d003796ee0e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3325, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013605417345710526}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3283333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01356203291952902}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..330fd5d62b8a6745ccc8043aaedf82d75dc968dc --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3333333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013613950010225605}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3308333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013588208070709002}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a434551063fa58e56a663fec4bce9b6ad4b499bc --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d72e8c248643bf9d6c806ba7ab1f76e05bf1dccd --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.32083333333333336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013480882752851555}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.30583333333333335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013306526255831156}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c5f806c7997b07aad625fde495355a3aa69b6d74 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3275, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013553211167251953}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881629}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4335aebda17f05b6294b9f6f3b0acba66b20e1f6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.32666666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013544340907003663}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013526454480351018}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3dd8676d86f4458303d7d9c77304d1809bc62e3b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.32666666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013544340907003663}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3275, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01355321116725195}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..713e6231fff6f91960177c301b7274f23ff72106 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3283333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013562032919529017}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.30583333333333335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013306526255831163}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4a1a8377e2962500bd9b68437dbd0744b0e00da7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3308333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013588208070709006}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33416666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013622434813136778}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5517f0527fd40731e551d0deada0bd89114dd4fd --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.31833333333333336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0134529489969963}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32083333333333336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013480882752851553}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a01167419735cbce7b80afb67604977cef410a9b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32666666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013544340907003665}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32166666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01349009528298952}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f36b8973d7dc3726b6549342ae9f07233f00d495 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013579531277800922}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.31583333333333335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013424568830356443}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..420f687e21f3d29bccf52776584cafcc01c83c98 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3383333333333333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013664144006618268}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6a6e6d3cfb5d5be3ea77dceaee695800f9b4bde9 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.32416666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013517438120881629}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33416666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013622434813136774}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..125c2a88e8bf1a07f3656283202bfccc8bbe39e5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6ac7dc1b1d86476d4b3202fd4827f1eb0d2fac1d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3333333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013613950010225608}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.31916666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013462309712005134}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a11c406d8dd3a3e08ee9db017a9f64a8893cc673 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3408333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013688600793296936}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1eb265380bdc9df78e423a8e71d2f39c4fc5e7de --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01360541734571053}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3308333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013588208070709006}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..61912f2842ee4bb51e4d78677dfba16928038a92 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_anli_r3_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.32666666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013544340907003663}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3375, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013655897185463655}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8eac0aa744ec9afea435c0ef1258697d3c5b1a9e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23122866894197952, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012320858834772273}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23122866894197952, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012320858834772273}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7408e2763a6192369ec1b626155ecc87736ee898 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": 
"heres_a_problem", "acc": 0.2235494880546075, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01217489663120261}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2235494880546075, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01217489663120261}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2aa9a03246cf8aad56777acbd116a15d86cf7652 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2380546075085324, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012445770028026206}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2380546075085324, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012445770028026206}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 
10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..149ea6f883f10503cace6814b8996b45d7950a3c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.22440273037542663, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01219140493860384}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.22440273037542663, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01219140493860384}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3aec94bbf837365632b388ba512af12d5099e081 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23976109215017063, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012476304127453961}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23976109215017063, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, 
"comment": "", "acc_norm_stderr": 0.012476304127453961}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3706e5e119327ae5edff3900c92471110855820f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23464163822525597, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012383873560768666}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23464163822525597, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012383873560768666}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6896fc87f84fbf9377c9ef99e5925ef54636ffba --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2738907849829352, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 
0.013032004972989505}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2901023890784983, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013261573677520778}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2230201991fc809e2d91926ba0d275a37b22315c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26791808873720135, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012942030195136414}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2935153583617747, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013307250444941127}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..794f359d9914a284e3318358ea39a04293594747 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_2.json 
@@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.25426621160409557, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012724999945157741}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2883959044368601, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013238394422428162}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5d1dcde59348365ebc10dd444abb3582582aa43d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.23464163822525597, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012383873560768668}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2815699658703072, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013143376735009014}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d41d65cf92cd4255ec3adbce50eca9fda893f801 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.24061433447098976, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012491468532390573}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2721843003412969, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013006600406423706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..408651902f1355d98a6e1b60873dcb3b77cf7aec --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.23976109215017063, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012476304127453954}, {"task_name": "arc_easy", "prompt_name": 
"i_am_hesitating", "acc_norm": 0.27474402730375425, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013044617212771227}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..be96b4b46bcb8a38a252b6af24345fdc74526be8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.23976109215017063, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012476304127453947}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2713310580204778, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012993807727545794}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..477561b89b3daf1c98e46c689a080e3ca04024dc --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_1.json @@ -0,0 +1 @@ 
+{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.23208191126279865, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012336718284948854}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.24829351535836178, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012624912868089753}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..42a99315ce7a021a0d932f55c4370e2e250115ce --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2363481228668942, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01241496052430184}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.24744027303754265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012610352663292673}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5ee8cebe077beab0108eb8265808daecc1297091 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24573378839590443, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012581033453730114}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2619453924914676, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012849054826858107}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2f1503636f83a0f123babb7c44de449166601d0a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24146757679180889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 
0.012506564839739434}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2627986348122867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012862523175351333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9da7972fff5f1dca59152b12361a8fd00f4c4eb0 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24658703071672355, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012595726268790115}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2525597269624573, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012696728980207704}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2c257b8e2c37d6f7d9d8624f5af7c9acbc707be5 --- 
/dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23122866894197952, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012320858834772273}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23122866894197952, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012320858834772273}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d73c2d35b7f4def97cbe6eefeaf68818c85b8791 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23293515358361774, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012352507042617413}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23293515358361774, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012352507042617413}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c7d75154cff18c8ff92a24eff965440c421e300f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.22781569965870307, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01225670860232692}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.22781569965870307, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01225670860232692}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..93b0eaa3f6acf30583ff2e86f58621644b22f797 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.22013651877133106, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01210812488346099}, {"task_name": "arc_easy", "prompt_name": 
"pick_the_most_correct_option", "acc_norm": 0.22013651877133106, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01210812488346099}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..08176cfa38428f700959f2a3dfc414487cf0ec28 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23464163822525597, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01238387356076867}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23464163822525597, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01238387356076867}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c70468976e55a0b7e342b899dcbffb66c9f21c68 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", 
"prompt_name": "pick_the_most_correct_option", "acc": 0.22866894197952217, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0122728535825408}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.22866894197952217, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0122728535825408}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c6999658e93f0e1d3d5071954cd1f7b8ee3cd420 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.25426621160409557, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01272499994515774}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29692832764505117, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013352025976725222}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_1.json 
b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f65f43c0248a88356425c172e5e79cc22c890e36 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.24914675767918087, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012639407111926435}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29436860068259385, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01331852846053943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aeeb6a1fd614e331a9ffdd329c33c84e05b6c62a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26706484641638223, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01292893319649635}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29692832764505117, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013352025976725222}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end 
of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c62df3c5071344ac4d00303d42dc4072a133e6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.24146757679180889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01250656483973943}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2841296928327645, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013179442447653887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..391f216035385353004a58742c6367933901feeb --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.24146757679180889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012506564839739429}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.27474402730375425, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013044617212771227}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", 
"use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..67ebf905b9f8b4514e777d5968b757539b83f800 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_challenge_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.257679180887372, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012780770562768416}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2713310580204778, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012993807727545785}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5fd3c75642a2e175b693a14ce39f582f60bdd974 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2516835016835017, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008905088235948782}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2516835016835017, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008905088235948782}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7abc8225ad327de88582797e7dc739132b56e7b8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24116161616161616, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008778027378258016}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24116161616161616, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008778027378258016}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..95d44e5cc975fef42f5d0b4f7efeb62ff2571dfc --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25715488215488214, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008968394768971995}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25715488215488214, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": 
"ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008968394768971995}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..66fa54f2a9c129eb111f54f6252df17cfd3fdfa2 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25126262626262624, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008900141191221641}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25126262626262624, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008900141191221641}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0e7aab1b5c06d4803cfa97ad9eead1ba94a65d88 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2529461279461279, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to 
solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008919862739165618}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2529461279461279, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008919862739165618}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d934d8cd2c0419b90d93918a5918e2f8e3c84acc --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24915824915824916, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008875238553583176}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24915824915824916, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008875238553583176}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..d889219afa9f78face7bc111ded78f205c4877b0 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.34974747474747475, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009785578618940728}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.31397306397306396, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009523245335215511}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..93297d55e973048a0f378d3b5bde3aa86bc365f4 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3265993265993266, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009623047038267649}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.30513468013468015, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009448531094163907}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..34e6d8bd74d8400343c3fbb186c7aa3b5827b3b4 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3122895622895623, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009509325983631455}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2882996632996633, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00929477425202962}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6ebdf263d3ed9b6224c4be8e7e87fdaa7df58e32 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.30176767676767674, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009418994158522527}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2887205387205387, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009298805565435518}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..34aa8b196f7230431f3d7bb7197d45a67543d74a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.289983164983165, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00931084097076903}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2857744107744108, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009270380606981212}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..24e5e619ca4031a958ee1539f6dd7ce4735183c5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2878787878787879, "fixed_answer_choice_list": null, "dataset_path": 
"ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009290733161670155}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2866161616161616, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009278551100969302}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a2e2668d3e304c58b33a6abd6d9b7d25d91bc276 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.27525252525252525, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009164888895174743}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2718855218855219, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009129795867310487}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..979cf635d8610cc77084ae3ad812834d96f030d5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2735690235690236, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009147424438490736}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2765151515151515, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00917788010146828}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4cc5dfd6ddf36cd97d4f6999a600bd1654b26f63 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.28703703703703703, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009282621598983076}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.28745791245791247, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | 
join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009286682281593406}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f355be4f4d97119e173a54d97db4d0437257c55b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.27735690235690236, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009186490105111906}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.281986531986532, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009233124071053663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..187c035e7d84f64eddaa743b0bb030adf4be45eb --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.27146464646464646, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- 
{{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009125362970360623}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2781986531986532, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009195059601583897}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d6c490e878dfc8ed6af874a651269e790a327773 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2689393939393939, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009098548093009163}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2702020202020202, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009112002229119856}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_0.json 
b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..df5b96e86d9880922e9742d0bfb3e4dc7a0019fb --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2521043771043771, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008910024163218202}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2521043771043771, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008910024163218202}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9fc3aa40b137632da095e35b984024c9203d2752 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0088449845819349}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0088449845819349}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b59c1b80bdedcd750197ae517ac8a7a55af77ea8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25462962962962965, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008939407288589414}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25462962962962965, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008939407288589414}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4af5e7aa007cdf34ead7e963fbf0694d25c80b48 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24915824915824916, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008875238553583168}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 
0.24915824915824916, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008875238553583168}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1911e8320829684efc4b34bdf0ef1996933b21cb --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.26136363636363635, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009015838366608193}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.26136363636363635, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009015838366608193}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..81bec97595a99c6207a567440efa33a57cc758df --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2521043771043771, 
"fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008910024163218195}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2521043771043771, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008910024163218195}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..98f497719f1eae04214d930a7dd844b5a3319874 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.34385521885521886, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009746660584852448}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30387205387205385, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009437524848293738}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..06ef6eaf44687d4d139bf0e7d83df21ae1890dca --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.31734006734006737, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009550648343947775}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2958754208754209, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009365854134140057}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ef8abaf53e6abdc2229bfdc28aa042a0c359ded9 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.30765993265993263, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009470292575831178}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.28914141414141414, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009302827114597425}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_3.json 
b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..761b2c9d57788a00542a41260457a8b674f3dbe8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.29419191919191917, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009350328648861737}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2824074074074074, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009237303403479329}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..357578069bf6cbd971dbb4847e77d661b3b56242 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2857744107744108, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009270380606981212}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2857744107744108, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009270380606981212}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..04877310b955ee17b99503f51a5027c4b0d43f96 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_arc_easy_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2878787878787879, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009290733161670159}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2760942760942761, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009173559873835257}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..efd9113aebb024452edcd27acf3a427cfa045b32 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5356666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009106972161130877}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6206666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008860362324722525}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ 
No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6f8cde9e013f64201787224a15e0f30d56520081 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.555, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009074825112195825}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6116666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008899620943397697}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e5a516ff1341c9f860fdbaa84437f8f5412cc3cb --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.566, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00905033901089172}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6013333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00894075859420942}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 
1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8c11fe3f6dfccdf28c6e7cec800bcff477c8b20f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.554, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009076827433934436}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5936666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008968593186211774}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4f96adff2c26f6580808d5a4338f62e4ec4ba3d2 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.537, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009105198233882231}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5763333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009023204169172301}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, 
"seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..330554ef42c940f19deb4cdecf4a932499c7cb5c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_GPT-3-Style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5243333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00911941249154913}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.5636666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009055910870388477}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..815e5ab6bd509bf1c54d19aaaa82d349b0cd4739 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.6236666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008846558976258922}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.6013333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008940758594209432}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ef5d7dee62dffdeef309210e674b15c0ac4c05ff --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.544, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009094810160596324}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5453333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00909262640355374}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..88a3b452aeb7e1e417591d3186329ca48bdfa6cf --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5743333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009028770205053249}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5696666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009041170615977853}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ca0034d7e7faf6621c869f1a03b906a2ddaf9d7e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.572, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009035073003655846}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5636666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00905591087038848}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..28f1180cc06269a3fd679c78d6e21333c9e4c5f4 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5796666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009013590979636825}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5693333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009042024977931079}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e45cae06345cf578b4509f4f19eac4033f7150 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_after_reading_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.575, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009026931658379624}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5683333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00904456300170546}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b687b239d5af73ec925ea1431ebd8b2e4f36c421 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6236666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008846558976258922}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.6236666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008846558976258924}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..71977405e1aa96f9168fb535eeaebf32f9ae879e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5816666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009007620714085663}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5756666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009025076316539067}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2454c35788da8878ada429b1067a1cae8aeb9dc4 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5926666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008972056373066367}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5896666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008982215188519145}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..93a1a31ba5d69fd8dba4c240d8c518cf9222a30d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6046666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008927944837940476}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.6013333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00894075859420943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2ab0810cb5952c5aa865362d8f156bc23a35e322 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6073333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008917381440148328}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.602, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008938230472973836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4af624d8d93f78d8f36554b15d14860f9d490fd3 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_exercise_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6033333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008933122315228996}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.595, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008963915658236378}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..13c8e7796b31805f1abf17ce849cd5f2742c439d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5503333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009083851457629941}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.4076666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008973202213879664}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..16f58925472a8748efbc5de04bd475ecb74a7f1e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5793333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009014571254680415}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5736666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009030591966818142}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..20f30d6a10290a13c2ef5a1d77ae42f90e8a410e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.584, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009000463105420336}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5783333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009017486788769118}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c524537cacbe6cd6386c4e64af8d8865a0af665f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5816666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009007620714085667}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5686666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009043721169619537}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2ce8ae7e43a373ea9bd7f24161ff380297326ffc --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.572, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009035073003655844}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5573333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009070008341418438}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5a016651c335328293f5bc7a2523785d0b075ca7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_valid_binary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.555, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009074825112195824}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5433333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? 
True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009095877403306732}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7e3c9f9db2f3090c5f1f1f8ef6e44444f1410243 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.623, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00884965755342756}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6236666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008846558976258922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8f4cbea9ea5282d870056f10ad54405e659d4da6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5406666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099982269204863}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5406666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099982269204863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..43a13a503a84485e2e8c5699bad845083d6f415a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.5943333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008966262991425923}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5946666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00896509146797075}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9813d98dc4e48e3ecc3f3e3d3a2c61f7999eeb9f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.6013333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00894075859420943}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.606, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008922697920438169}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d531c4fd87a6925d6c6823159101226b00a2b2df --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.6006666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008943269429955152}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.607, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00891871708850756}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f44bc3d38fd5324f5bd84c9a35c360bde442ff90 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_boolq_yes_no_question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.605, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008926639623340284}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6106666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00890378508047089}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f0f4d6f0b322f8839c44815c1985d17eb1262171 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.1940928270042194, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aa3482758bc0bcd5b5673670c1d746ecd8992f2e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.291005291005291, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a419c9b4d2dd0c88024e0fd71298972fb57ca9eb --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0663363415035954}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.3, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ed135b5faf91a3dbd2ddd51e27cc90dfd52ec762 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0646095738380922}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.29239200515796265, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..65c0a1621444b9f4b3a2adf253fcc1830e7d687f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.3392857142857143, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06384226561930825}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.29558442323553674, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..496d4947f2e85685b29a276d8135709e56fe88e5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.35714285714285715, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0646095738380922}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.31511470985155193, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b66ba2edc3127799ef4bff3481374183765c4b35 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359538}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.1940928270042194, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..252f08b40104a80a6496aee9a0736a9b2a6f2304 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.31149301825993553, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..47614d2c007f48224b66854a79d63f8f8a0447fe --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.5, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.33210150283321016, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f9933f781a5ddf056f7ceef8680299216391e8a7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.29572649572649573, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..271768743b385d3088e43878d6e8088a02271e3c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.3011063011063011, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0fadc185b1177b4b3781ac221b7b4300928bf134 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.5178571428571429, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06737697508644647}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.3561416013304823, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4f9d652e80f3ea53ecb2d090700ff924d61bbf63 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.27485380116959063, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c10d43b9e74cce8fadf79852fe2b60d96f910600 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..38aa36612d341f0639f052b8a66d80f8a87e7b26 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2930756843800322, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2318551ba4ed17302a59de8aaa3e1afa22285061 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.375, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.28451178451178455, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..59ed385e9e7da46e22d77d76ecd443ad278f365a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.31015369110607205, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6c62b7483d45df4194c0d5690268c57054d01b00 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359542}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.30233779879169953, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d8970bee59132753627b42e2e82b8fec7ab1ccf7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.2857142857142857, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06091449038731724}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.22072072072072071, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..df5c2436fc0b215e16958534f4705ee4935a13a2 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.3808729165425035, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c344dac1772e23b2a71e60762b5e504686ae5aa3 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.375, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.2798452714419101, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bd01290b921f4dc83ecafcb6651efc0ede16959c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.3208410636982066, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b059f1e728d8b85e1fc5420617a17916c4ed4cde --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.48214285714285715, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0673769750864465}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.3494339622641509, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cb098a78b1db1aa3180aa1972b939b8aa63563a4 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942397}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.3299346405228758, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e4020239343635e4f036374787c3279491662b17 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2695374800637958, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7a129caf8a5e4d99301f66ba0a798ca86d081b9f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9e465749413925a5fefb511db3dd6738ccce22d1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2824214792299899, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f6e865cfb27062f1f87f4341421e90ebb4edfe1c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.375, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.26587301587301587, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6b0778e8b81d59a869832f1d590c3101b5bc9274 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0672477765493766}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3361048122952885, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b612536bbbe02f596d7ece426f7efd628a0683e6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_cb_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06585388898066351}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.28699781799512253, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cbbe5ef78ab830def6170331e78a739f6675fa65 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.57, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cf3ec7f60e01ef8733cb9cb8f9c4d98ae7098536 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.51, "fixed_answer_choice_list": 
null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..193f1fe90c350e2925719d66acf9df8b5fd912e8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05024183937956913}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.52, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05021167315686779}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3cbae98d0ddfa8dae058a23ba3ecd1d8f8745f26 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.6, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- 
{{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04988876515698589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..589a46931387a987e97fad0b270d9d11019a298e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.58, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.57, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c1cbea62f3fd5dc114f71a8be9cea59aca5fb540 --- /dev/null +++ 
b/4b284b28boscar/eval/agg.4b284b28boscar_copa_best_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.58, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.58, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..afde415abcd22b2ef97b8b28bed6b0ff4f88cc1f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.59, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04943110704237102}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.53, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, 
"seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e8d5008fb86706becadf3181dd65b4e5d2dcf09 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049431107042371025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..303d8cd6a9ffaed59912071981e3d2290ec5c0ac --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4e41f1d34fb7a69abe02a90efd5753524411e2c2 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f997e993772cf84d6b51da54f62b5955fbf532a2 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" 
%} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0f7e6ee9871eec4b8df1a6084326faee0e925f35 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_cause_effect_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.48, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050211673156867795}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05009082659620332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6f4a937218e55e7051b5d2775fb7e33d5ddf4417 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.64, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04824181513244218}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..474b539d9b6b983e9212e91510751d5349525799 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04975698519562428}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.36, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04824181513244218}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3573ecfff89efe6d542eb9824036d3c3eafaab19 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04960449637488584}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..edade9b07ed87a3f7a43ce6c7e906f5d96ae0dce --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04999999999999999}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1c5a70ca153372645627230f02a9a585df782528 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f5e25af9ad60d9b38f601830ec1657c6f3df7679 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_choose_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049236596391733084}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..445dd872b9bd1d6d391e70dca06f822027b83850 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.61, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04902071300001975}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d7e5ef8408ebfc5c2fe79a69b4ddcd877b2dc692 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05016135580465919}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5619c01104e022ba59789e9eb64de102b4777c70 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049431107042371025}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049431107042371025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cd2ea1ead05b30e5e8b5c9ccea8c5ff2c75d397a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3464130e5096dad599d7921adb0d74228042a09d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04975698519562428}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4a71843f37f21f41f037af275f1b5f5a29266eb5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05016135580465919}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..25472baaf19ce25b7b71dac4a98e89dabed77c08 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.63, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04852365870939099}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049888765156985884}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..08eb70ea888eb3afbfc2e0cbb1a300b54fbb0d8b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.46, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04943110704237102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..88a2a4a610269ba6577d558c23c9407a81cfc6bc --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04902071300001974}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049236596391733084}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5e0a0ea098db3b962da32395cb27f55db6ed260e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1606843cdca642590a539aa3cd813563dd1c41ed --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049431107042371025}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04975698519562428}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e11006671ee0f6644928fad6058a503aeb428ecf --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_copa_plausible_alternatives_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fa3973f75f6e2cfa0db2a3ff1b592b8f584f8032 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 2.4291343231615197, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1081476953315167}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.30490376061705526, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0041264630846786585}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.28898151867581695, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003749068801492829}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.27814178842810267, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034140931959471587}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.11936753561219268, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002002664631013812}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.10993101885288985, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017805025745999989}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.10747367819482016, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016642061918427447}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.23846890714089322, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028662860213796654}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.2256288532111635, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026144614511853597}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.2171258274284509, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002322752684753328}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.27549124953733534, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0039982004120365}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.25261701652716345, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003253103645087724}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.24738399674674577, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0031658939531734086}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..068709a9c44419f9ec0550ac4df57f751f5d54d0 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 10.367490688848457, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.147199946045334}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5083228320631105, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004117353888131676}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.3987167805017131, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00352094056862558}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.42359842430158595, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003166429715222895}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"coherent_text", "rouge2_precision": 0.23914610666636482, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002866994640346074}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.18408201451158962, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002299318168589243}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.19604007520987582, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022207416309622243}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.37329748745499713, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00342657052646914}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.2897412832730678, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027697546621908526}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3087285163618671, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025534325140347484}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.41731459418954514, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0037508371487298972}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"coherent_text", "rougeLsum_recall": 0.32540873261888204, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030943465702578284}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3463896607181391, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028571074801555145}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2328a9677691091114b6119aa252229460331f65 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 13.390236019438653, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14842576200349367}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5731248702131649, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035369412980793694}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4540586945350603, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0031237260395784983}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.48107797071022573, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025529853815747603}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.2886235972780352, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029184758928492037}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.22509950336148685, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002358526308667815}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.2384229110806097, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002198745135162194}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4274682346753081, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032262516803834794}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.3355857923635549, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026048668389623054}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.35636457776206126, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002269467152908682}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.47792410828677606, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003441578862987895}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.37679777676824877, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028744498867330067}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3999201210250597, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002503427292150924}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e71307f14346e72682918b0ccd94d4c1a0db392f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 14.11211105597524, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10622153797928337}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.586920292942165, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033015565803634636}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.46387873389087075, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information 
given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029845335098343225}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4933918814621236, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023507630543591544}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.296797885536512, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027849534966846853}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2328967005520696, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023744684508588587}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.24703844136495406, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002179120785142707}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4360600199796968, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030776623311689765}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.34248768718320766, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002574220770064095}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.36479402136291444, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002206895287411414}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.4892157265900996, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00324195297597455}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.3863267663286363, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002838025198180799}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.41093370948084046, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024047099802087667}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..28b9f76407beb0f2ff7c613f764edd146bc710d0 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 14.231011749286287, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12209443108687626}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5917355418032829, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 
0.0033192291366049035}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4649612536810019, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00292734750937545}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.496456725245094, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023467453451773064}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.2993125833156767, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002816066206715958}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.23258741498970592, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002335135354391098}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.24822901966358188, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002183872217474791}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.43644364494193044, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003043256161686109}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.3423023863196363, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 
0.002561190633854322}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.36531375235590563, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022079073968463863}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.49113203678064726, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032253527956686463}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.3864885531751292, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028011091689833898}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.41241929556327334, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024101261730711376}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f5b824b2f3d578cefc10d14fa177c1c51e267f14 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 14.053424233811935, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2139098854103794}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.6007655467087342, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033178934942541586}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.46208545695254966, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002843128768795738}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4992427047391062, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002287441676423058}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.305416095977467, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002803489308110065}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.23208608723439855, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002287994510775193}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.2507510925889971, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002153672477363239}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.44410464141299394, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030541617625096406}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.3408331566739278, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002505664645443682}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.36829390265765766, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021967315271236207}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.4996850800292637, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032601873043127173}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.3845230626564954, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027439013265701992}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.4152808816287861, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023832871878115454}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1143c307d27732ca03f53d22fee4fc152977cd6c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 3.1714342318075586, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.045965301633825444}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.27892126811926726, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002600122184386265}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.44180866598838936, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003143862734393032}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.3313708474779751, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026143663411672363}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.10968173222483277, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013140525894327851}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.175414562274607, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0020139474282820564}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.13068696654151762, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014575624518219298}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.18435049608419762, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015399304730552884}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3033010356499215, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002174783861918955}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.22199600069535, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015907140119287217}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.21103060760030867, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020453735196029944}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.33604504573169713, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025253980447102206}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.25095884676584657, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002058529882959225}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..9cede53f959085bf8efd6b9e958dedb4d12b40e1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 10.721803251704452, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17211225173321326}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.5374028280682072, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0037863238755228563}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.41172220237757656, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032971954655288843}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.4412622742935095, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028484776236360502}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.25277948098530284, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002827753217659691}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.189574321126897, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002209880786417029}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.20377550295587993, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the 
following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021293945685583646}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.39366543713570407, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003296033015397104}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.29758323325914543, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026078527492868022}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.32017327345810115, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0023605669683786938}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.44063300992380655, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003575829648604807}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.3350879058384037, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029240248465231527}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.36000645986461527, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026448711945656627}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", 
"num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..877784893e2861e73f660683bf6e7acf2274f8e1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 13.185638820912253, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17905937600833666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.5847965205117387, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033347702831086844}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.4531345534571351, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003022869025252749}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.484703373181249, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023713839952825065}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.29228753297166654, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028532818928023464}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2225530720847117, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 
0.002304379004839278}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.2381525007024568, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002136037647803045}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.4343959678194726, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031186453492311928}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.33311897707583316, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025202919059291095}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.3573228133925937, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021528692893471066}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.4847599016564475, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032967216795653054}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.37369553194499533, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027755064030562265}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.40042732880195286, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 
0.002355966614225236}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5d241cec554a8b223080cf04ec168922f00e329e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 14.107794954151904, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07762803738740154}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.5895720638586597, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003233593739284638}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.46460331525201143, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029286341256693055}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.49576437128816564, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023177278666779853}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.29542191408821167, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002760426839218843}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.23089746509831843, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002327871040394227}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.2459138034781126, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002162302420104042}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.4352618522184374, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029819501191046195}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3413198904559014, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002518962067001467}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.3645010870889512, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021578123182128444}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.48929479088617955, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031632938136278974}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.3854980456003556, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027907143453587617}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.41121563779500553, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023650416738361323}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a82a38a59d9d408da6d9db28f72ff6be6ffba545 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 14.409703264220804, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1737734796397264}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.5927894623799085, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032205049337101993}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.46958390044562126, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029192980510850597}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.5011786848332749, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023379416400234657}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.29855627088086, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002759091662895473}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.23459170627954518, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002340077345372699}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.2502413808790468, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021979815342001766}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.43466325313168735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029448990645185706}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3431853955769687, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024941981600519157}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.3664874450908063, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021711317825607286}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.4921604150152806, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031835522336336663}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.3900149073671336, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00278747643338989}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.4162953555843644, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024261965876490334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2d7e4aa24a1f74d710780a681572b8c21a54012e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 14.36873102419111, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.24341981242466684}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.595927773483148, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032425483991447954}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.4687061585590258, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00287648066845073}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.5021416185467042, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following 
data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002304993139891524}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.3005779660822629, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027762341601524085}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.23417519742244133, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002322950198994913}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.2507085203896114, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021883812851069395}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.4389314551113407, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029888231854793028}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3445034163549983, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025327223922618452}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.36918156449504774, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002219219514765805}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.4937823771022341, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following 
data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00318964228631743}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.38852149707353656, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002762325856795982}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.41611364077049817, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023925606250996302}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3d47b18c5549bb80889abd1cad2c9b1fe628c3dd --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 0.03400024051061316, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.003924009727004915}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.03163495022271719, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00213402606254315}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.013052648219502593, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0007368912322178317}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.01489202936976875, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007731390330539368}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.007676432368912842, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007079297568989281}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.005344496240226761, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00042498193988387223}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.005699997724184529, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for 
feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004296769766697958}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.03128230881527702, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021215996271714685}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.01278853369132827, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0007111869202364915}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.014595759988507867, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007449503316530413}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.031288243684140286, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002125501401336427}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 
0.012820813365160109, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0007273203927635051}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.014623667224382438, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007603892415303771}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6efe521b6dd664749400ada87f2a1711ba39d0e3 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 7.816534164583369, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15565768866774704}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.35702422811192785, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in 
meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004519300459158055}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.30333646003854636, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003920539462119713}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.30802363196459925, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003686633191390042}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.16461108331765129, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00270536053603032}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.13913189196694387, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022575199491451678}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.141349628450228, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002168592529358599}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.27136837128521674, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034720568847473434}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.2277467839540949, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0029218597390161497}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.23179943842474957, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002721088306573688}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.2992939969488658, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} 
{% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003911479322747588}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.2528327880900938, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0033438933458369248}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.2569619426281397, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003144471646322155}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7dafdb07800cd5d5686a4c7848deaedefeac1915 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 11.40727566060487, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1394149858863304}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.4581076639157755, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004355728871402772}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.39394217549623817, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003758657609304119}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.40125830728731005, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034787841094553523}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.22093478501798178, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002899453214143421}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.18922034434713683, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002425949769422214}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.19229296968051052, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023147668235538378}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.3370847303471444, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003384862536690441}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.2890375434227376, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002864993682553475}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.294053280095868, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0026379563075927944}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.37869382337261237, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in 
meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003810740310963444}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.32547687183919616, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032649447682233474}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.33140711720509614, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0030511867850170843}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..655ca036f748af7cc5cc4ccf5b78c07b97844b9e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 12.648284327714496, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": 
"", "bleu_stderr": 0.1412239962827133}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.49870772839379046, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004079935779929387}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.42681008778722584, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003596695473457261}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.4375240256725131, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032382818772945395}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.2429520126991852, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028473077920114502}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.20834090112824027, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = 
feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002501461063250894}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.21217343973137, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023188663097364524}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.36347126840877136, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031832297343980212}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.3118840814238212, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028265209782573778}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.3186046651652964, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002514178531320354}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.41125757524693257, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036259652798149387}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.35231454064715534, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031900363709328303}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.36066633891312805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028985429124288673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..42eba30527091f7f2de64396462d41533edd5ab7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 13.559446770038464, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = 
feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2304781739750598}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.5259656210972891, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003996774173806683}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.4421099389675289, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003360129610911662}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.4587518411625586, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003111470911283907}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.2604451621552076, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002891203306732668}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.21855085024457707, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002416176572188965}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.22594698540162944, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023352863911978252}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.38041876582911677, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031927129913797195}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.3208557577808482, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027105884255468845}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.33156794470795, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": 
true, "comment": "", "rougeL_fmeasure_stderr": 0.002486646659841678}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.4323128551857689, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003596550511256332}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.36432376421020746, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030413424235041842}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.37728696967304404, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028464369344208744}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dc01c34902408679fa6217140358eb544c5644fb --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 13.545588426744118, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2304131026428412}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.5409229061537895, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0039794946613703796}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.448377942904066, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003294797070295318}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.46891543241081723, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030504683364321462}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.2688654034164371, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 
0.0028781695998310276}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.22137196577153798, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023525122978486418}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.2310792018618973, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022881465661611063}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.3878387871633595, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003170334663588811}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.32245201866172035, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002654582527964297}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.3361248025687532, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = 
feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0024554385707299736}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.4435204322386192, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036158281316008125}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.3679817539692809, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002990071151094627}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.3844448063353725, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028242027174122603}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1103adb311f4ebb09c657565a070dae44b8bd700 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5904ea253c30e1418d521390254ba4d64c65acf5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 11.807245592727982, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14268734921118004}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5694397994427649, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003239379072871242}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.43330345866585573, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003044911625941733}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.46557148222586153, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023657331986333577}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2723840046490374, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027516718471111436}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.20361914910986256, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022039744834451503}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.21912991483763275, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002069315036688718}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4186095218334134, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002968616014528494}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3149916119112548, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024715985756080345}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.33955869538197114, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002087373657004864}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.46699348061463236, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003191007134428128}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.35381509149376406, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002783989467610118}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.38066023518841285, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023459300318536126}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..90ad0a58a5e3138d07f0cf5cd365c9adc98120c7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.207475603022154, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.20911303585808974}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5838609934278248, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003219893788304826}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4676453406199286, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003012959553547953}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.49425084203142144, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023454913129034973}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2920964834179201, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002749941677148143}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.23229590203180275, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002340599122551353}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2450476239671336, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021622871202404523}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4326595154400953, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029553909822487533}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.34449565433901846, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002541682059011724}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3646666089186555, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021523804779859127}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.48625392335924433, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031665719502181147}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3893023276736312, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028360977602737624}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.41151419710242576, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023890835672487213}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1f8afad62dfbb2ee2745d9b4d3f936127873ebf1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.881355021152629, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14798982038506134}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5813338263183422, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031068563548781497}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.47562332710532934, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029492958281890696}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.5001074662527342, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022864719896120603}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2933839366604696, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002656083464844314}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.23916505911615024, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023564329861628973}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2506081474778741, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021353634161434465}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.42886203590947314, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028648415383139367}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.34942359580045534, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025299540186551716}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.36772667157315414, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002137446714056548}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.48630515550742015, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0030844914644101774}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3982270500785823, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028293510736079205}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.41854738428796234, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023767509258669573}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fee00bef7fac2b0497604574590b574fef729fb7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 15.161167960986678, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1434464937691823}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5828889415157229, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003147155952785506}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.47816618968937713, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028905507376413963}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.5031994429048829, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002288693420421454}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2947392128854523, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002705329023608755}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.24017998180506026, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023323057167825125}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2523951605760761, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021595061752098924}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.43041916996749047, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002887136906682548}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.352331798953218, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025196691672468205}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.37084325448755734, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002169564896258564}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.48846331616257255, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003116275576376514}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.4014433365412298, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028080914024126535}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.42211119537556474, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002395568997982313}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..724a843d0ca2e24e465520c3ccfe9b0251bcc9c4 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 15.199579399737006, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2197710642602159}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5888360095441599, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003159796703393573}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4770950472138781, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028673153056796336}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.5056858022442728, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022854558866388314}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2996637182544266, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002755313978179728}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.24042124929715428, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023404910908957914}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.25480077699734965, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002191468155403316}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4337481481034359, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029181057449224846}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.35012226658269135, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002503515068580577}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3714282516661889, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021747599500282675}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.49452192644513626, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031483071698951075}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.4006792202843592, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002780394897574505}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4246436350463985, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023956383797766666}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8d7b716327e287ca6ac8384f76961918cc3c8c31 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 1.115288380550811, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03584738835397564}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.0481634098481736, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016087097616289147}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.10707802462357018, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", 
"dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003385941066987629}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.06478686781175597, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002096643728341116}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.020012235219458552, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007339674775155459}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.04536128791284365, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015985026619880938}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.02703438376489087, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009617584924329875}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.04072107560821441, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013394434625095691}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.09210140922870967, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002942849426224985}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.05510083034332008, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017722509136964503}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.043583758115497744, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014653987421120727}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.09700126154529612, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030955294628613847}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.05863724251471288, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019106515434616845}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fd76d4a0c0f1650ad18d20591356e0d81ecc25fd --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 11.190352808928104, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18141915137411}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.5677445861811545, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033984313369014468}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.41532706024078114, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002917300932047345}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.45251036431033176, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023124683743620694}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.2744861149641647, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028946434469851188}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.19515249470299148, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021250810681362133}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.2134831249471247, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002033720545474901}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.4249656266902133, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003190196797303736}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.30648054044850537, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002403100236029252}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.33535417773592563, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020836745399405625}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.469797154283937, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033750646684610054}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.34104989274687575, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002659577970371275}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.372491915618829, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022926855668308236}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7a03377f10c6e61a419843757e096b82bfbf1782 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 12.679562321227413, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2222126775351692}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.584577354262805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003347858172867005}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.44205456550510863, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028911343461195844}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4781335252015599, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023177131734020964}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.29090406263628454, "fixed_answer_choice_list": 
null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002883255237867255}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.21626343858582678, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002250184676673601}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.23417338269491117, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002138781746654695}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.43779082304044564, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031428137257300858}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.32836831949072104, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024657583395343735}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3559492850424143, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021548825504076837}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.48637793220824943, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003319535039264522}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.36657855951325957, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002716088252466532}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 
0.3969389628127764, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023548448674264918}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f67f0f8c9e1f0938b6df3a1b9a1a6bd645959ef1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 13.302937463565117, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16122159664144584}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.5874865856088282, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003310089916183211}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.44727502252297524, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028944779700014653}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.48368846020707407, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023223560319026336}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.29353842921389267, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028487712438280198}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.22132076463133735, 
"fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023142648024014117}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.2390946966736538, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002182786299360237}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.4371504816750144, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003083689894416344}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3310191386275968, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002498138917180889}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3583981701407721, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021794562630022907}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.48871506210624377, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032662459958388208}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.37171338204559984, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027596290462697038}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.4020210227451716, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023835574293658724}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9036c1f38baf26652fb1e858edc54b6f747febb3 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 13.433876035255205, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11981015230222622}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.5876353731004864, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032800870923946806}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.4488101154572819, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028867381201944163}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.48569076033953423, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023257013392330443}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.29256227650092403, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028389393416033155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2211736710146012, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002322811471619675}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 
0.23902637874509242, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021849331256108503}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.43579481435548406, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003052897338172175}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3317240836685784, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002502354705544728}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3591529229249466, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021946437964988253}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.48827906155886375, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032373730408784213}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.37332408627485886, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002763882689822959}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.403767269262012, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024006083640941635}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of 
file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c2cd0805c2b30fb0ad4be5972cd784a7671799fc --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_e2e_nlg_cleaned_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 13.506223827879625, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13712432815643727}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.5928496432260469, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032872080307399898}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.44991984920712536, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002854639924992808}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4887998407141921, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002319156170252818}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.29918813275057654, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028678438572694092}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.22416121461467314, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002312463482767483}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.24345616505754553, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021918190877205047}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", 
"rougeL_precision": 0.4417618550309857, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003074003692942964}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.333769883964141, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024914760495667993}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3630039450500851, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002201392663162804}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.49454649911344245, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032669183780217074}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.37523026419685435, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027419119193864048}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.4076162949696335, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00239783693691282}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7fab834ba68c0e4cc4809d29b4ad225516fc3387 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1 
@@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.10961523878541282, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015892800805468758}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.2760154067129983, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003714427703127575}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.15501116522406902, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002143767701315978}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.016503385727741075, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006937152978442476}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.04290433837786585, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001838510464413981}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.023537346012547193, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009840202178341868}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.08565381552764036, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011225671784216172}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.21734942060398876, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027643020863370284}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.12136447200761914, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015209692569129127}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.08820687667796023, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012712858820566506}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.2240859928005493, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003132931618671561}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.12504245478102102, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001738309343000351}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.8131130851284528, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04714338380259805}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..999b7c46ceaebc3801d06ee3a2a35ed1a1c57d53 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.11659718760405827, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", 
"prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019742524127726855}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.149104507025026, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029509495609674774}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.12255449947246683, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001955813890814555}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.006027752453637144, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005191187509380493}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.009307727994348461, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000907548674614976}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.006748722038979709, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005881345903334706}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.08952779755714778, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001481045986907457}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.11272020580989889, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020202289653695653}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.09345694256222935, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001393697589554955}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.0921370778595584, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001508772594316064}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.11881744272884845, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023731154530475876}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.09701653702411298, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015017590341849362}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.4318908013687936, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09340356040913948}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d9a8e62e30f3fc79fa99573ef86bd5b15716fbc6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.16677371184257006, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003263851048544926}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.1865625419381177, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036738567366226727}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.1641766923037994, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002787542118343757}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.024073436874321924, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014755313251919172}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.027534094582926417, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015892930983602675}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.023361198351848255, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013103167812832366}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.12573595130504708, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002419342312394539}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.14040929192793153, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026694063784176297}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.12354629572040322, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020211011832145563}, {"task_name": "gem_xsum", "prompt_name": 
"DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.12809116280754843, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002429412800673777}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.14548807960651497, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002942315760362548}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.1266583625631351, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020977751126439147}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.2185124475941114, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1247918930209988}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..18912db1cd6e4e0b90748cc032b4c2a4f5b96dd8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.21090220523104927, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0043400296722759715}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.20623055336862026, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_recall_stderr": 0.004117296972903804}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.19540218617384542, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0035981591988257}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.04446812394522324, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002433325283250796}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.04330763423954221, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002130444343579905}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.04043225200079106, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001971128849289818}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.16102102863101164, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003522787509763259}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.1562623845114745, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031877827820907226}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.14823954382505022, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028266046266465796}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.1632209386021339, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003534529597444769}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.16014367488766096, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003360192272089281}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.15084505474710264, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028701096981838467}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 2.384083581683153, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19545490163541676}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..61d2dd6e7303e9cfaaba22a383baf824bfff83ee --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.05924863405775099, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0037430888437938997}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.05315091802828831, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034485744515216376}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.05206730499634627, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", 
"prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032224075534875705}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.013141821010449908, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014263480788354573}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.012179899285613686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012588083212497355}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.011639585528160717, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001181669731295198}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.045593974139523485, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029554838918758393}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.040334899215324675, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002664958357606189}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.03966046102164665, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002501900408036937}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.04623428184973231, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002984379602534541}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.04133658908689536, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027627667262542506}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.04033802381773822, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0025377176339826104}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.09150173402093377, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03488801022859231}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9f42b9aec58ec141c901dc26cc5f5501c7ef6aeb --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d643d19196ac3dca93593f148eeffe67405464d9 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.14342162008284204, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021489787880454073}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.32838306815833007, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004520779915543697}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.19594002271344585, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026553778162506795}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0324041203368683, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013857024425616468}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.07633647234365959, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027120097095923}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.04383970111779196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001502517751586814}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.10834989026863737, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017010407378355936}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.24852399538651415, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034523875710231393}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.1477613060651902, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019730423409828144}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11337131132440753, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018502354074941042}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.26102578003284527, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003905320793767065}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.15488179011332442, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002226328551514201}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.711960179300553, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09364733946460982}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..c7363bf595d28604af2dd7ed5514481634425a13 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.218502654216416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004216701848226965}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.22612472735288322, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004050424823157234}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.20735885392080328, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003416950171252691}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.04666208027558217, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002339875983008479}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.04728243481580427, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002144678600899603}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.043166660515364066, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019489298067270461}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.16554354693857987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033634278225486212}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.16887951213275842, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0029798022422798257}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.15586444152297085, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002611115681188433}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.16802010031692538, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003363169417299626}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.17423319642542318, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032438974470327254}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.15904480903777268, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026546734966954384}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.038892604450971, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08659921775202399}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fd5d2a6d6b30c812ff43e9a3952271a359d31b6d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.2686202317908267, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0043807329681565515}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.2442620399832786, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0037187930025226325}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.24502315675229497, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003527107028051239}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.06761930566167147, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027582692970284004}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.06045947601825875, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002402252483561905}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.060811060789406665, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002362985597929733}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.20701714470631166, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0037001369903729023}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.18782901699110188, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031305668524696704}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.18846932992317159, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003002336243674728}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.20779734909360606, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0037041900951829566}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.18902539617665143, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003170681415495645}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.1893130337834815, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0030095150279783}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 3.4680465130026668, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1806305418260011}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8051dbb8c069dcc58fd0430777d248d5ee1998a8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.2704616336812799, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004785011358046082}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.2338132083832053, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00393975832858179}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.23897677492951283, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003816266861615919}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0686867366515392, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028770176560848778}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.058824913888813044, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002334355572633918}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.06053382888543694, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023807363425603057}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.20739899599482822, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004103033934906544}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.17693699968227222, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003197846899502375}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.18162079511431126, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0031790525970022214}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.208443470756841, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004123726328746496}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.17832743614767127, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00324196291285183}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.1827225485802873, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0031988935917358708}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 3.2188873987668196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.21129832875336893}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2bad2aceaf2a9feceee0a9fce85dd975bc4b6ecb --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.066217183773399, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0042555829917650285}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.05213679480213446, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003385771564869977}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.055389768861489226, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034919433178371593}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.017664150375129962, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019205293888410434}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.013372101380357073, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013831809690219824}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.01416209803140751, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014186880826512003}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.05106781746182636, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034158251072776654}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.03979759120276381, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026778266516859447}, {"task_name": "gem_xsum", 
"prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.04233447640190309, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027559578153928876}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.051403778989441484, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003427827319089082}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.0401050806117895, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002691735875078422}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.04262294039262988, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002766382698213603}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 0.04659629275993456, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.019558262715467265}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..984ca2d2bd2ebe46713f9279d9df4e804f01c4d5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_DOC_tldr_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0}, {"task_name": "gem_xsum", 
"prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": 
true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..556fb4c78220d11c6704ee1ade73cba561b47298 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1442510190731307, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002377555886882021}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.32728957236911, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005137969407531113}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19658526341354268, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030181310698989858}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0341581223240638, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011714333255302305}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08176991191725373, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028112899026499264}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04748504386381342, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": 
"", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016110390499460413}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10623987893735275, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018433462761128848}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.24157333634286496, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003878678496036975}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.14457182138352206, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002237374752785832}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11372296782139678, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001999008147870968}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.25936158541657245, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004311577104350546}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.15498205908882362, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024811396605881834}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.051690852375094, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0919729448596882}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, 
"bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5013a590eba3602beddb5aab0e9560926c6f7995 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.21346282259038915, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004034521833789361}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.2499714791367549, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004127977259047798}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.21363061590069807, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0033936589765782323}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0463678167671605, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0022751074397479123}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.052694807564007726, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002334059403839543}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0455477081115806, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020625955375166345}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1620749435386503, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003255513503258099}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.18819404884414812, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031630393650993625}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1613719463559732, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027032991504210456}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.16522714514641704, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032476758014330694}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.195040375695118, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003405905178416842}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16555112242432596, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027468382198912893}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.019074157200998, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1854543722412079}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a8c85c43634a180331483e8f414d549bfcab7244 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.23937462775757284, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": 
true, "comment": "", "rouge1_precision_stderr": 0.004035761510891073}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.25228250851051215, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0038577616272618006}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.23064419705209097, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003326199002008987}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.05306913769010263, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002351977805815094}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.05441248374509546, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002305037996521187}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05025303080761008, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002124751664870315}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1812036165079183, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032724786112898127}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.1907813012976707, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030970666700906747}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1742531731484881, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0026993941796824017}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1831569851914895, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032779593192606743}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.19469605892775696, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003266640672239043}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.17671769275213156, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002734700050326904}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.382425295806561, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.185437417569327}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b643a56be940fd49205175405d06a96b9471e618 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.24352976537606963, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004350047350541235}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.23908129808521703, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004002524315183232}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.22778807117999614, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036243774679184364}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.05729292572552575, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002617653205209828}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.05451017420559761, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002306325013293872}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0527365635483454, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002240194553760494}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.18538163733792024, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003600645888264381}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.18116114634018265, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032128881913064335}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.17283311820835626, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0029786990259966836}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.18664238451530393, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036151543248168095}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.18367415808096732, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003324929336987573}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1743815051388937, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003003683858669049}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.8525428900629293, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1568990466406635}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..64ae7246bd49405a6326972496df257e4ef84e02 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06761052931830998, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004168524274964794}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.05943339586169317, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036304617540668326}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.059542504834570575, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0035683053141444022}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.01636682763621811, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016719863572741598}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.014189933840370804, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: 
{{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014657982637994635}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.014234207773434364, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014260690361126657}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.052604784133526844, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003336410205451067}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.045692475599885846, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028314510568171344}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04594672257630698, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002810904537853349}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.05281145831744821, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033495714563258135}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.046242631863418045, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002916590491225937}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04622428003631112, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028346596378458684}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.15163352455062296, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0413547235971038}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c7e049b6b204235c3d9198246476d5ad23811332 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0}, {"task_name": "gem_xsum", 
"prompt_name": "article_DOC_summary", "rougeL_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c4647fad97dbf74fa1e09fa6bbd51f18a7ab1e67 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.14340787798544366, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018032543453857495}, 
{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.348079749408104, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0041382877901599245}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.20057647309126575, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024023673762588573}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.02767775271537802, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009721356559807497}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06937497981951246, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002453319853871487}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.039066544296982385, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013626441060990537}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.10053424559477453, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001277020654050582}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.24542491643370598, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030533732704981437}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.1407817781383674, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017158350976538461}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11420554133997407, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": 
"Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014870409134011945}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.27886116865239546, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035333760799287008}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.16000303452385659, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020035707234822052}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.5345722016857484, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05327551375382703}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9344fa6e71366f204f398d668e233ed7cb33cba0 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.19865216741377603, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003516512262635199}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.29135534859568224, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004246772523997126}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.21705829457281703, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002959231545359903}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 
0.04118386653309974, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019283565176599058}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06151103352010455, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024532430125831714}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04491172266106288, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018380470883612547}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.14735138093875785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028518722500306447}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2146519342508703, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032903532550251045}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.16019371252800158, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00236853277069081}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.15275410364710615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00284109895626164}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.22737937708782943, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003605645003436984}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.16759564345626984, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": 
true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024308191123714536}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.9172311763738956, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16322447855789024}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..94a9f1959296958da531e9215265ef1360c85452 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.2465315700047466, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004175970619992801}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.2744895062916164, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004132633230322607}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.24156257829126068, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034116082967927393}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.060132830131985185, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002584710303057828}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06493647553156313, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025380809952402056}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.0580916953307158, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002306895150723321}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.18689848183823216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034836019281092594}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.20621460918831602, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00329546339611092}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.18229586646296705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002854614939760979}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.1907928929206747, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034476831058094398}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.21471338674155627, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035611110348475897}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.18747549293928492, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028727564091906572}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 2.704613633746308, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13932816628204614}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No 
newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cb30826dae5d512d521f51d53b4dbf53fffefb86 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.24488137416704311, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004439983550252802}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.2547015400524934, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0043629817539744134}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.23419955703690729, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037544069015259768}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.06045943164183697, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002503136670039761}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06245888691656072, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002433600168461948}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.0575651168139506, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002238805869014741}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.18511602065268223, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003675578589755922}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.19090721673096625, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: 
{{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034376644701036387}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.17601063162717448, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0030542550306405063}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.18745473520934786, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036729872607959803}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.19536859046211874, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003598817184826712}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.17894501465520485, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003087597155662158}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 3.089437062529702, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1827268409396636}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..530861f755f64cd61bad333472ee72cd58228346 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.07353706957375673, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004426917583408668}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.06276117005276965, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00387156032277117}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.06132087500847303, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0035658659998728734}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.018229014909705746, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019191437419252921}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.015796133646357152, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014959398218503275}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.015003585358323187, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013899419336256885}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.05725219630577079, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00361985637625251}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.047238026145296305, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002971356877752647}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.04648951924651044, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027574847885540104}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.057917410235608965, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.0036412515752914593}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.04864579563916333, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031056015721909335}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.04735855139435085, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002806718999058402}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 0.19070979838812588, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06378186049279796}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a302917ca2d53fee3a12bcaf4467923b2cd0e803 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_DOC_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.002549776342784112, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007166030165886934}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.0020413905595106064, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005594890729416889}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.0022287175815817136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006161246419501403}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.00038085758119054455, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00014461738794117318}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0002657839755509012, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00010178048880223215}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.0003119009542132786, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00011884895101753975}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.001898994303267333, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005358267380604738}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.0015046723091027433, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00040639938290870474}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.0016459691843579136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00045147423875777586}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.0020030453576513505, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000560832569927041}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.001594104620308841, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0004280320767214678}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.0017415567558494514, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00047483426648456086}, 
{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 9.871985357472749e-40, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 9.721216135694307e-35}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..50b6a55f35cdc432f9b46747ac314ceb793c9171 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.15035747550854134, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018537172978333066}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.35742314169547235, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004326079160564529}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.2089918826583484, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024707731495359864}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.0349468617386321, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011091670418172431}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.08661816478419919, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027998243524170996}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", 
"rouge2_fmeasure": 0.04915426636292421, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015546014590166785}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.11031701721659484, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013825633108291136}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.26338148416281726, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033183960564176712}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.1534641057893518, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018542646706649216}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.1191235584341918, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015439232638346807}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.28484018718916787, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003731890576938668}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.16584195681107322, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002084501252740023}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.9294630165344384, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05199953236184783}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ddb458df9822ddd81c760396ff7619550be9f918 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.20762450013236594, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0039461291351093625}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.24413763817790524, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004174823772437886}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.20791172933370228, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00329315292456849}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.043001413856892226, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002203182319482167}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.05070781231089209, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002301401448382558}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04260592446633687, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019713326310437816}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", 
"rougeL_precision": 0.1562927305009076, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003049742520195418}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.18293962617097087, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031269642052189203}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.15602073715303139, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002503729351832688}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.1593551971386879, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0030732593775087977}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.18915937658696833, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0033947710998175912}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.15996751955267816, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0025845243156891263}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 2.0042439168055477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13033809653477546}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c73235b1906d022d09c19462d70fb02098303edc --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.2359424064551194, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004094026469321015}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.24137156152029446, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003746084943800122}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.22509114147071038, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003359349736460282}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.05084069401291977, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002375545332584951}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.05062417462546563, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022092158246353067}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.047784340461202085, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021244464358628947}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.17830347583335351, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003301027358370855}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", 
"rougeL_recall": 0.1820807332453224, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002954788303945269}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.16971181352967674, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002694937124690388}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.17986687172318414, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032949814705597958}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.1852104189778644, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003082867447381655}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1716985031231568, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002716095074624316}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 2.3963330281306017, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10229956742443976}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c686af5cc26d1baaf32f2829528d5440b4d660a6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 
0.23537962078639738, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004425637093001513}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.22790233981678285, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003956303010732718}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.2195806592889226, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003671944538669731}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.053206008686741926, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025279299018083337}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.04971780973026118, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021622161152509266}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04843631029051122, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00210059083909135}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.17726440409046024, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003611471694185854}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.17124218344005204, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031466317030328515}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.16496863705556303, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0029641595071464256}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.17842403243803612, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003622742304784544}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.17318872190675308, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032217666739748746}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1663330953913527, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002986175252170523}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 2.6146189500481682, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1536987337730129}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7db2d0cb7e61265b5e0aeedf75721161860cfb10 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.06325338928249166, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0040370124259995}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.051302455688280554, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032918601370581302}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.05273405652690356, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032801143050456877}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.015839424272271533, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001799340153660356}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.011885180583528233, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012328784003733852}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.012504879614731455, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001294899307586975}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.050331670340206996, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003309508915566947}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.04029477500488443, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002631169598444604}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.04150375469427366, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002629344620451854}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.050554573055058456, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003318947794768516}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.04070306392026555, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026745492387180727}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.04177261698814985, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002646732117471144}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.07355919987680871, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03017235561587739}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9b456ac0b172e67ea970f87bc412a38dda46909e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5630cada8a1a9362d56c185ddc9fac40607b5854 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 6.06565649817244, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.26580129636863487}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.08236949927702025, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0024104850431885413}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.719480590736069, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006398515591482853}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.1346828076880258, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, 
"comment": "", "rouge1_fmeasure_stderr": 0.003005110263614226}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.0662606810581365, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.002335185253764046}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5581501081874023, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007956342433328234}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.10761118468871383, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0030242837703836024}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.08166459222277754, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.002389577144028269}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7162201035906264, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006438467461215888}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.13376588963623146, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0029942137429941505}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.07995628492332586, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.002395303523446372}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6988153667130336, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.00667405770086968}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.1306994980839404, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.003003875535784151}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8e28ecec65f050f1164ed4e62b8b077c531e05e3 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 63.26037952408198, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.951804882148515}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.6988575984790449, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006588116693745785}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6740866889600351, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": 
null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.007010681242613929}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6727480193548931, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0068474612270140334}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5527918786636355, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007922884516561693}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5405879983491555, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008055493652125685}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5392222989862876, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00794919143323893}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.6841121597165629, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0067911100878414125}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6624258059575598, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", 
"prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0072087579910821735}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6606953549883641, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007053863769326496}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.6873985752009085, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0067517698213240395}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6643687405464695, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0071647281967591485}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6628897552319865, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.007007884050491445}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1b14609dce36c4fd4f1acd4161568526d3cc66b4 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 65.7787599516413, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - 
label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.8805278393831618}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7160937126971031, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006316259371960246}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6983512547114311, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0066631774166225404}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6960318899051244, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0065150837518839376}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5735137941726731, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007734486272816341}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5639314795514925, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007873750511765216}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5618897276378911, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00777775543365932}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7024300295876501, "fixed_answer_choice_list": null, 
"dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006522310714591612}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6874458530825792, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0068708930660312095}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6846881504443719, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006731291659364605}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.70510042853508, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.00648084314614621}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6892365545992871, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006830029581115679}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6866058410510959, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006688511386355791}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": 
false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..76e970db059923f9873a5e17677509290182d664 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 68.79659105137841, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.8911607307587284}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7281768854202323, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006091328345208682}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7187148914356442, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0063093252000116454}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.7139591788200526, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006222444811463132}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5895951227185078, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007589501024665014}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5841983766424242, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: 
{{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007690796430733763}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.580739926062922, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0076152906715177895}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7152369828707047, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006320392771571204}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7077083520073133, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006545042418523855}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.7027473285080598, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0064598423376290865}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7175001091151413, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006279054505869695}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7093908666073235, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006503238973466924}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 
0.7044741566673249, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0064178648210677725}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..85d9667a2e5e74e1a076cd25a010ab36a2882079 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 68.89024334929532, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.0053364628892096}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7283756240423309, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006093561817243837}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7204913392251014, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006274653215078908}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.7153440029217795, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006190308119095787}, {"task_name": "piqa", "prompt_name": "Correct the 
solution", "rouge2_precision": 0.5902361563890636, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0075450855599918385}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.58542613380595, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0076389718894045245}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.581967194437133, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007569458361483834}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7155444610558683, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006310523362454862}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7093415097622576, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006503769590635487}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.7041497846834553, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006425679507241933}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7174324105961624, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct 
solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0062787061597621655}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7109083354996658, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.00646856437751123}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.7057056561042719, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006390567486985987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b3eb3446575d9e254de5dbd0d222aeae085d9e88 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_Correct-the-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 70.06736987950548, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.9767815709778029}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.729594849228905, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0060710557444570695}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.726939841836775, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong 
solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0061673276257683965}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.7195026463671974, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006122247411756088}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5934420624153478, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007521177536614038}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5917838640682289, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007591836716456811}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5869778194049874, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007530370937645896}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7181192568693671, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006271435942895118}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7166816998141146, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006389531553039699}, {"task_name": "piqa", 
"prompt_name": "Correct the solution", "rougeL_fmeasure": 0.7092419533890473, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006342085468956855}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7196391201464266, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006243283678540828}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7181584658507878, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006355241979175212}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.7106557360440093, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0063101236805124935}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..599e36c35c74441006d63f8f3230a061f96fd325 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer 
by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5316a3d94118d81180bbc728e4d63ebb5efb12c4 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5027203482045702, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665651503000727}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5027203482045702, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665651503000727}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5aeda3460d0a4ee62ba2956132001328aed02ede --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.514145810663765, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011661154475524838}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.514145810663765, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011661154475524838}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d295cb7de143ed495080ca33ed8ac2e020e9f755 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5212187159956474, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01165531473228886}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5212187159956474, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal 
and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01165531473228886}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f7e9f5b39ff268ed950848eee6f7388e46f73634 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5125136017410229, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011662170084916896}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5125136017410229, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011662170084916896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cceb94c8dc012bc57bae0805843e9efcd2504f2a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5092491838955386, 
"fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663828032649181}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5092491838955386, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663828032649181}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6c0af1121f09fd4deae4296ed171259d843cfbf2 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.16038917825535667, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.013645183773152294}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.019717407526919313, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0005241226898875045}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.21991235718003924, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004124788414398084}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.034295347974031186, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.00080874557662922}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.003053738189335298, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0001553041432895281}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03824249021088632, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.001975265638985592}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.005371789507833525, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.00026020055238192285}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.017812403761871688, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0004288523322774551}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.20443665582319562, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0038249367833209564}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.03113183301172919, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0006873859750781993}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.01619119548364551, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.00043471160357438384}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.1891909150799779, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003730983463664777}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.028214906108285757, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006610920318795323}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6f216eca2423f173f9a80c8f0b4fe5a28eed18d5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.2751691068568516, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.013136219804520673}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.06885210549637656, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0028535182012972664}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.17842265399629653, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004223503540841405}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.07214164014313752, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0024508893541148966}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.013821947965005339, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0011850586486327943}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03344723252560877, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0019089636877996823}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.014805868732602081, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0011349585009864634}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.057690189785966156, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0023735730007109603}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.1583254724810553, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0037922699239114157}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.06141675468189888, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0020741009028569673}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.058586296430585875, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0024669791370603633}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.15379774689758435, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0037832161320439433}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.061260958770885235, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021205972969045098}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..802f78bc4dd3c91f0b5d988279eb57f086d7cbf2 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.7478592848030261, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.05206294347980578}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.07482884032327494, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.003612480132983617}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.07821116773610592, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0033314055394672555}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.060146538447062185, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0026253423803596007}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.016468959770221926, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0014638792570753052}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.015742688935978556, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0013085844552320407}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.012735175881541076, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0010739021956960627}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.06379992803442668, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0031121495725695463}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.06819680109978733, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.002969336578287296}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.05160822400895001, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.00228266908781442}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.06657979060268567, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.003264654543949671}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.06934836838932225, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0029951702164645205}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.053315918135572495, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002359636769296838}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2020df956e3f1a5c906b98cea86f190041c571cf --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.45426082881899243, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.0606143505719565}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.07290254469938463, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0036333756842878845}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.07008239737410384, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003310749345271666}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.058422577709111474, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0026946026782868468}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.016529422242712644, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0016568372778018017}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.01616429994036002, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0015418921009779543}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.012703293081847745, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.001197207060641643}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.06262930735049298, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00319556996686759}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.06161665158045485, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0029931784245315683}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.050359394410487104, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0023702453331618492}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.06499144598728437, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0033179552370669625}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.06248884178951301, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003011841506938599}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.05173144445865948, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002423125428506434}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..41bdc445750f3160e707c80544f1fc0335168bf6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.5257099514220486, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.054470098489529585}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.08433502515888319, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0039111129507774746}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.07755475127098414, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0034502441379280794}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.0674817786636535, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0029464283746983093}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.020612038094728467, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0019179044390943054}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.019342108926184154, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.00173678799008175}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.01642293331569968, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0014739545404562904}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.07275740895813725, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.003451903236764541}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.0683379924797514, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0031386155388587558}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.058732697860771205, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.002646209414844525}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.07589809456013663, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.003610021093780004}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.06966112094912398, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0031716009555262064}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.06036538574416117, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026965674415596326}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ca88570770fd4d1034d87d7b6e6dea960a7494f1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_no-prompt-needed_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.6100817469863049, "fixed_answer_choice_list": 
null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.07611933506009459}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.09731785854243621, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.004118827876530802}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.0909234801856218, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0036289176500432993}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.07895885565036868, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0030819472178692767}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.02311349812806924, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.001974642909283926}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.020134835356896908, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0017387345774011085}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.017754410094936536, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0014564164985431847}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.08330336586514235, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.003613644785911595}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.07943627989992699, "fixed_answer_choice_list": null, "dataset_path": 
"piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0032943078048450745}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.06801592952723212, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.002740700561112419}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.08686183044375809, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0037620960553600316}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.0817329947639033, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.00334337780108826}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.07057923291607501, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002814634535587915}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..079b5a901f3bb8ae36a49b774ae93f1011269aa6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4961915125136017, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665485744746797}, {"task_name": 
"piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4961915125136017, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665485744746797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8929dc7855c3b506d0116983008c805a58012420 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5038084874863983, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665485744746797}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5038084874863983, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665485744746797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bd29e27c45ba74c5cc056611c05aafa68881f5c3 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", 
"acc": 0.5087051142546246, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664055982032837}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5087051142546246, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664055982032837}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f6c195c992bc15d510be7facd0239b42f3a3a15c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5097932535364527, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663586263283223}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5097932535364527, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663586263283223}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 
10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..888501a4e3f8e118e6b841e174aa5cb5cbdb5f1f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5108813928182807, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663061261117732}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5108813928182807, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663061261117732}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c8fac36778c904f77d1f611ee9df07fe5279a03a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_pick_correct_choice_index_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4912948857453754, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664055982032843}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4912948857453754, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", 
"prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664055982032843}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cfdfc271fe168b1a5cb5572a45947c023c0ca75b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5658324265505985, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011564264866016057}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.573993471164309, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01153737544851943}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3120ef34a73c68e996c53053e7531050be85e340 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5636561479869423, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011570895640553714}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5674646354733406, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct 
ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011559142916063145}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..60f571a935483b39e54d2eafb67dff5376316ad1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5554951033732318, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011593746871584154}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.559847660500544, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011581954727227395}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..49b58b0bdc5cd4e43127460de1ed87e53cd3893e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5484221980413493, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01161098935881427}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5527747551686616, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011600659443292917}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..523166f02013d6f6e2607a4ea3b096a8dd399a20 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5511425462459195, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011604638382510184}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5554951033732318, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011593746871584154}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d1da960351f54d200a9906cd5594a4fc8924e7de --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.545157780195865, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011618148261187405}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.544069640914037, "fixed_answer_choice_list": 
null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011620422647622242}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e6c56122e2f8f67880efaccd94d40e04d45035dd --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.574, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01564508768811381}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.512, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015814743314581818}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b92f9d63babc53876c92ee5ee29aa28c58a4a8c5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.652, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01507060460376841}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.632, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": 
"Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0152580735615218}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a1ca150e4d7fe7e1e0628d834ca0e3e11354d11e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.656, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015029633724408947}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.646, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015129868238451772}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0ee8bd1c13eaf58d8ebf764d6ad5872884ed4652 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.654, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015050266127564443}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.653, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015060472031706618}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6021a7e422806c0889dfc779118fb13f4f50327d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.663, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014955087918653603}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.662, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014965960710224482}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0cdfd1c911bb4da1130eda93eeb1ce70c6ef8d3a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.659, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014998131348402707}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.674, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014830507204541038}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..232d61ab928895819b69a8e00fed822d7d5e35ee --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.854, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011171786285496497}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.767, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013374972519220051}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..de995356b2c6d04e1e4e3d2c7d9e4c9474838b67 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.889, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009938701010583726}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.885, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.010093407594904603}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fb6583a381d275b37f72d7dd147e027bd6c464ac --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.908, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009144376393151086}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.895, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009698921026024944}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..72f90b64494de315bcd77a6d0866f6a9f1990b35 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.91, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00905439020486644}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.904, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009320454434783246}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ab4f872e64f3c50e4b92ad12d3951ad1908631d2 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.918, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008680515615523722}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.908, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009144376393151084}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c9ffa6224af94413e24980656f56e0de4e130024 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Direct-Question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.921, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008534156773333457}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.919, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00863212103213997}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ea2bc5d2ff6b1011e516c363c07eb5c86bdabdb2 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.311, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014645596385722688}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.35, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015090650341444233}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8997b82058f2a077622a964f84a55715c2bbb8bd --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_1.json 
@@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.349, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015080663991563097}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.354, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015129868238451773}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f7911612d0f50fda569786f447bf8722844f91c1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.335, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] 
}}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014933117490932577}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.35, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015090650341444233}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d95033bd9b3c861db4e064e64314b1e0bab2af29 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.349, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015080663991563102}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.36, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ 
answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015186527932040126}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7a0214d2002009dc0c4a7c6d9889d84d6f9233d0 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.347, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01506047203170662}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.341, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014998131348402718}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_5.json 
b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d0ff0de57a4acc24a1b84376abc712ade5249a4a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.331, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01488827258820394}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.339, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014976758771620342}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..36a25ed396c2a08f8f905c40186d46c27361f813 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.285, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 
3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01428212095520049}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.303, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014539683710535246}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9fd2440bdf6806fe4c90d916f880f640096dae7c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.287, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014312087053809963}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.288, 
"fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01432694179723156}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..52a13bbb3ca9b1287c3dc072e696c9a496ebd633 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.289, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01434171135829618}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.295, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the 
provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014428554438445517}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c068a3097b61e17f5666569a44a147cf909901c2 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.297, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014456832294801105}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.305, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01456664639466439}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ae54c8216e54b9b3e9cc0e219dd1dc5eaaddd9d2 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.292, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014385511563477336}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.296, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014442734941575018}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_5.json new 
file mode 100644 index 0000000000000000000000000000000000000000..e6cfc0f345ddb945f42f3779448bffc14dbd9359 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.299, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014484778521220468}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.302, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014526080235459541}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ce691638d18f461bbfc702473f12c3bc3a2d3dc8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.292, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 
0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01438551156347735}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.299, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014484778521220466}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..91c795e9d9952d61de22bf7155b425caf796fe24 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.318, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014734079309311903}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.326, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": 
"63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014830507204541037}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ed9e6484a8ecfa50ee4c4a3196fb0a8d264ae4a6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.289, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014341711358296184}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.298, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, 
"comment": "", "acc_norm_stderr": 0.014470846741134715}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7325dbd1d15d3fc6bc3c048056a96419898b8e5d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.317, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01472167543888022}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.326, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014830507204541033}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..79a27dfd4719ed6a14a9f1ab696e088ad24956b7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.313, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014671272822977892}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.344, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015029633724408947}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f01f346c856a9246a248063822085ff4702589c6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_sciq_Multiple-Choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.325, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 
1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095524}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.321, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934645}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7091035b03e6229b0c6811a43d142f0c471d66eb --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.5045430251202565, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011561954965856519}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.5056119722073757, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561703928784327}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8218739ee41182ba65a3770836aff24b56f5d56d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4580438268305719, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011521653168224733}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4756814537680385, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011548748301487319}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..42c9434faf7bd88d762f498fd8561db4f62bbeb9 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4649919828968466, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01153405649450586}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.46392303580972744, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01153229486915312}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..98c85cf5f9e1fd3e289c768c60dd64fd514170bf --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.46178514163548906, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011528611805439893}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4596472474612507, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011524715486240653}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c1589441eb3789b2fd1f837fb6d7a0fb1dfb5faf --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4660609299839658, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011535764881641411}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4580438268305719, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011521653168224734}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d6c04e044423d60bdcbab991c27d6b5f3c287f87 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.46285408872260825, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011530479981182628}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.46178514163548906, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011528611805439891}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..951e47fee3c7dd291e2f326ef117f9e6c27abea4 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.49706039551042225, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011562232421541939}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5146980224478889, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011557435464292921}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eab0f835319eb699f825f10338d11abb16d044e7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.46980224478888294, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", 
"prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011541325320336616}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4949225013361839, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561836054238772}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..06441d32b0e6bf54a86e5df53b084b14401414c9 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4548369855692143, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011515167912227987}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4660609299839658, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011535764881641411}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dde919d07f3220ac022d9caf7f54cbbe4e39a2ad --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4580438268305719, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011521653168224729}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4607161945483699, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011526690316014587}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..658b8d7d518d25936393c41bb8f4f37b604f410d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4596472474612507, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- 
{{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011524715486240648}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.46125066809192944, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011527657726586461}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6ed7ebe52ad0b64f9921faf41a6ccf1fa3d616ce --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.464457509353287, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011533182338113984}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4548369855692143, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011515167912227987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..58679032a50405bc0a8c574803de46b71d209855 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..988719df8973e37f14751a1eb3a6a7f1b19135b8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1b3274859eefce2332c96a10f27cbc98a367156c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f5218ec8838e361d52dce94995a72226b4f60e1f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No 
newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f7ec1b0cd5e323262761d65d35093ab836d0f7d1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8fbc799b1ba1e0216431eb47ca3a3dba2af76d7d --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..228ae99e8dd756a57a182922be1a8a856b4a5781 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.5050774986638161, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01156183605423878}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.51309460181721, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01155846638336718}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2bca35175288eadd6917962022256ff675802727 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4660609299839658, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011535764881641411}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4804917156600748, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011553628196999307}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..694908d77ef058c77bd7e2d285c4408df9949622 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4649919828968466, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01153405649450586}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.46392303580972744, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01153229486915312}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fe5dbb93209e4ad0cf14a86875b0ac8234466b81 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4665954035275254, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011536599118298178}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4575093532870123, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011520605695184077}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..086c327db24258f692847db06296236d646a33a6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4548369855692143, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011515167912227987}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4569748797434527, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01151954486592806}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..21d868c902bef78faca7736b08e41b19cf6644a8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4575093532870123, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01152060569518408}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.4649919828968466, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011534056494505868}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..93d0b33fc795d0ecb997344f4085bb6694ff1c68 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.5034740780331374, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011562153149168303}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5189738107963656, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0115541041740197}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..348e078975eb2a17fc43ab11af860de0ffc9fc84 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4660609299839658, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011535764881641411}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.49545697487974344, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561954965856516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0eddf064d7c8c60d01216825af479358f25359aa --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4489577765900588, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011502027057558893}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4649919828968466, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011534056494505868}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bd2b74cd1b7085bb4e7a5756e52357a3c45a1f84 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.46178514163548906, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011528611805439891}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4559059326563335, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011517383123961536}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a3f2eb540d5f716b6473d731d5a53e7589eb7793 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.46178514163548906, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011528611805439891}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.46392303580972744, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011532294869153118}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..67888b192b9d98ba3374cb161b84cb60f1703940 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.47140566541956175, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011543509045585206}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4607161945483699, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011526690316014583}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2678865992f3d16f012cdf1fd75582e27d8653f6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5667870036101083, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.02982676408213827}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.47653429602888087, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03006330041190266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..61e706901017f08b7d55b9a3db4365a1ef6f9392 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030063300411902652}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5054151624548736, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0d2bd879c802d094b48a45760f2985810f6c37b6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03003973059219781}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366422}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4f0e6b54d5cb80755f5b1b4cd1dff31ce069e224 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5415162454873647, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029992535385373314}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030039730592197812}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f3d8bc0a57ef2f6bdcddbb1b977b0947cf167b56 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5523465703971119, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029931070362939526}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143713}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..10e021f48857c4930c85724e220764f97337e6b1 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.49458483754512633, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.48375451263537905, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ea1567b0141c6d3a007d70406f3bb39e9ec715d6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..65b394285c8daae92dc99b3c1cf43cb31f88ac0a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c31e0412e36633ed3f67eee9241f1e6e767083f9 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dbfb1ab5a777addff0fc167704a299fb40461716 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030009848912529113}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03003973059219781}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..69a4ebfecc0c39324f1eb5d3395ceaeafd51b8b8 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5451263537906137, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029973636495415252}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5487364620938628, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029953149241808946}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a83dadca6b9a8907a2966a91a8d2e2e06c1ce96b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03006330041190266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fa7856e0d16f853e50cc432ed18dcd5ec7218744 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030025579819366422}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..be76ae200462d0afb8057ea96bf075844ef4aa2f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.49458483754512633, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976633}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..16dbdd18fd52e5c564b33eba2b3c6fa7a58475ff --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f50564243442f9dc70eb0d05222c69114f62a64b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5523465703971119, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.02993107036293953}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5523465703971119, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.02993107036293953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a510a64d950664018df8a28a0736f3218795c34f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030009848912529117}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030072723167317184}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ca61d0a71c734f36cc8b014f4c0ff599cfa65fbb --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5451263537906137, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029973636495415255}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.555956678700361, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029907396333795997}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bb9994273e748a1e47d37e8074678d62eae17ae5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..80ff9c062b808d5c06756ac321fcc68678a58926 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.48736462093862815, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e7c0d0c0c1ae02b80555fb5ba5a3849184d4d66f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dedd4b9dea22a24cb0ee32be194217aa051b8796 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030052303463143706}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03003973059219781}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a6eabb4aaaa5cb48b88456ce67e0e84988e7dc5b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5487364620938628, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029953149241808946}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5415162454873647, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029992535385373314}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ad78a35acfcf86428d5af9f01d6992c3df36d20c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_guaranteed-true_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5415162454873647, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029992535385373314}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.555956678700361, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029907396333795997}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..145b4397c9f59acfdef109712c2e8206f737a09a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03003973059219781}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..821ac56a15a6201c66cc32214446cc7756d6f870 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c8dc50340ee7c90e85c8deb5e7a9bd7db2395a63 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..47ea696955184b1c5bb6c0cbc79e1cf92afec977 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5379061371841155, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030009848912529117}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fd40291a59052a3e2007ae2d86eb02311e3bbf74 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5487364620938628, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.029953149241808943}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.555956678700361, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.029907396333795994}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e2274075025f9ee2f1c13fe15d4e2b066c78d717 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_superglue_rte_should-assume_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5523465703971119, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.02993107036293953}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5523465703971119, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.02993107036293953}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b28dd3b44abf3a676c72671ef60a58686eb07dc7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.4940805051302289, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.49171270718232046, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050555322824192}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6003bbf76993edebb1158f31bc6535e5783aa328 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5059194948697711, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5027624309392266, 
"fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052271211616438}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2f238b8c180000ce2e1f690f967b75964ab3c35c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.49329123914759276, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051220692330346}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.4980268350434096, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052376259225632}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..37cc34b6f4748a616af62e011934f1cf8a08e465 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5035516969218626, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with 
the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052131146915867}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330352}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..19e72e646563a591b907a26f99916a367f14e98b --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5153906866614049, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045826789783654}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5153906866614049, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014045826789783656}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1f8923f34dced1d836f392c29c562440c427e94e --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_Replace_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"winogrande", "prompt_name": "Replace", "acc": 0.5217048145224941, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014039239216484624}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049294536290396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2875e4e817691316ec9409fa56284a5dd57bf1c5 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051956064076896}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.489344909234412, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.0140492945362904}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..38ffe03f77ea869d32171ed0afa8d124396f7826 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4877663772691397, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014048278820405616}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4846093133385951, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014045826789783663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8a926ebae622cb3ef5eab6b5782c18eb662e09b7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051956064076906}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4996053670086819, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052481306049512}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0f10be884511df04e1c73cc0a1977995d0171b70 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5059194948697711, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5122336227308603, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014048278820405621}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f2a9944d05f0f085ad5e068ed52416f1fa26584c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4988161010260458, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052446290529019}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5011838989739542, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052446290529015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8cc81e586e42a1f52a72e96ca4d88fe27f80445f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_True-or-False_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5027624309392266, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052271211616436}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5169692186266772, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014044390401612976}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..31e0eeb6f84d3d89400c73850fa7269dd250845a --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4909234411996843, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0140501700944977}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.494869771112865, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051745961790516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..58628f24dfd6b5c048bc23174d5a9651a7b1d0da --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5130228887134964, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01404771839399767}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051956064076911}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9f4cc954d8695e76dbf6c4479be5faa7c502a0d6 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405090552122858}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049512}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5491ed472a9d8ce6d354be52634ae15b2f785e43 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5153906866614049, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045826789783656}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ee6acd6b081a54c482041e5c873a8efceaa7e6a0 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5059194948697711, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529022}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..67c0f28037fe3270d806a86328c652851adaf18c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4988161010260458, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052446290529015}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2c1ce25c9521fa5f1365c758df5f728062751d36 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5217048145224941, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014039239216484622}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5098658247829518, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049749833367589}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..317ae223602dce286195d743e532ecc5ee7abec7 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.516179952644041, "fixed_answer_choice_list": null, "dataset_path": 
"winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045126130978594}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050905521228577}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2e9476a8b430be295a63a73d461dfe69b62bef67 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5090765588003157, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0140501700944977}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049512}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e63604ecf82b95a188398ad72f9df07987fb943b --- /dev/null +++ 
b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5256511444356748, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014033980956108557}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5224940805051302, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01403825782405988}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..838c83dcf8cf1d51b633edee2d009b284b350abe --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5122336227308603, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01404827882040562}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5185477505919495, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014042813708888378}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_5.json 
b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5e8beaad7399b050b337a13ea152fcef3d48333f --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_stand-for_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5122336227308603, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014048278820405621}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_0.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5f377e8616c4f2695e76c0521e38da95a93f9274 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5019731649565904, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405237625922564}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.489344909234412, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0140492945362904}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_1.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..95780b87df599e2effe542103a4dc0a1e9bb1b26 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.505130228887135, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051745961790513}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529024}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_2.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..89fc548cac99402154326044de6a57ababe73985 --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616438}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5122336227308603, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01404827882040562}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_3.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..04141f25121676c73088925af9cdfe17dff751fa --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051956064076892}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5059194948697711, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051500838485807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_4.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..62f6dd3800d76893aa0ef34d3e816d354780cfdf --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051956064076906}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405090552122858}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_5.json b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5cddeeceb76f70463050b0d4cc1065c09faab43c --- /dev/null +++ b/4b284b28boscar/eval/agg.4b284b28boscar_winogrande_underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052446290529019}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5090765588003157, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050170094497697}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a1121a9ce15297df556ff8a2c7b8a00cd231065 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1db03bb42d6a6e9a86ef688cf71628bbe330018f24cf4441db89805dbb37e06c +size 3966297 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..54f2bb07e26be0b6c2cdbe5d783a2773018658d9 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fbfa0c8da8c6c1912fd23c9d40c670945a5e8575eb0361ac6ff56c9fa5cea12 +size 9432598 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b7543a130860950aaf92123522c2e7ea97397c24 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:673150ecd9cf9179bcbf06b648cf7b4ac8a78001c8db0a9095bd0a63deb58270 +size 5564708 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2aedead95cde5d18ac15ae64f24c57bf93a6c2a1 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e984dcbdde758a8ebc8482e82332395fb83af82611263745024e6b825b00f4c +size 6464202 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..5e31906e26335d1f720661a44aea458ea5942017 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdd26052ccd97a6e4d60280774a9350949f1b8a56c26f2cebb4b01022a19f720 +size 7349938 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6c86c66b840a5f27c21e416dc7bacee0d1effd84 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db9f7cf9cdd47debf6445f868aaa0c1372073e9200a1242daa780d41f88de3d4 +size 8202213 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5fe7076e5fefa297ee2116c9d6b8bcff558cc068 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb0e333602b276384c0f9ff0044c08467bf475ff59c35bb2871aa79ab933cd8e +size 4451150 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1fe041b73eae640a8ef3a5afc7d43b036e601c9f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89f983bdbe5d231e0af8ccd6df867256580d0b7d15f5e925258fa54b8af1218d +size 3403801 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..49ddd18c1d37e7ead26fb4302fc48b6cc453fb05 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9abf34145217615b9bb3a3ff3eeee48534364cb6a9580863ed844103e2aae532 +size 4065624 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0c762c65a8b1e7e876863e66051d6ccf77228d21 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c82d739e1bc3635211a1d039b46266263a4ad49f1fa4284b741032340c5edcbd +size 4785021 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb9083f10d7fd522eded61fd7e09ee88094366ff --- /dev/null +++ 
b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4de604ce9825771ef1f09f6592722c52706326a19fc8b62075aee094f549fd50 +size 5528331 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f66dd02bbebc6d53f405bb30621389d75ae77455 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:674371b3114258cfa8b70db93ea8ed0f05c585ebcc38f8a767c79cc362243a12 +size 6249831 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..088121bdf85dd1108352f6c25ca1110c3e394cbc --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af1a563d7b28223f9d6e5b58b7372706408cb0ab744c7ea0929e4b2df06328d2 +size 4459344 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e47d7cdc2248eb32db78d14bc6efb7ed3ee13663 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11d314b004cdede061c691d70004239a847e248434dfbcc97f630c16216f4eb9 +size 3173928 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..634465efacf993f5c620335c75f007fcd08f1439 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e02b586609374d71348f727e9ae6fc9ce6d4b5508fd9a1c7f47bbc4a3a15681c +size 3755779 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7c67b82b4894753cd7a6d41a1be747c556ea69eb --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a40273779224c116b5cc86d25be62c040bd07ddcf040ba3e270d32429f3cc28 +size 4414615 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d93914e0a996e0177d912061ebbc095f448541b --- /dev/null +++ 
b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e94e0d433e9ceb4d80d6c1f3236fdfb9b1f5fd28f9e47307fabdd8e6496e5a8d +size 5083721 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..907234db14c6289c2de2aec847b0b5e5963fee35 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53ec027d6f9f0b539320f091dcafec45dbed6d4a18bd578e9a35dae2fe84cb26 +size 5758825 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..160dcde995b6f3881e42e67dd8487fb300be3136 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7b864f2a70eaf5d49842b1425dff23ec58ccc83d1ad778f64327e7245a389f4 +size 4027982 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..18132ab7f05e0e7faa8c9b49211798ab7c1f9008 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b318baeb94e6f20a5febdcd55cbae3615d7b660ef458f2c09c8efadbc8c142d9 +size 3850431 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eaf28504f3e55a3ae9fa749a4660c5a51f3b12fc --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e97a0532449b9d3bd5cc76783e21a38c9c9c76e1b1b069096f67a71bafd9577 +size 4633117 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..052e5283ae5dee3399c300686e401b54a5a78499 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef7f9b95563b94fb372e9cbdf10aa2efd1c0931c9456ddd872e1b63ace0421b0 +size 5477778 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..58e64dfa15d4313adb1d301afafd8ada255e7fce --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:609c16a1e36a5685035fbf6041f6c471a51d377a34892b6a0376604aaf3ab08a +size 6332205 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1dcebb31291391efa98c1321b00473f75a0b0756 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:472e44f25015909349d239dcbddfac7d6a00e4d3768c01307125196a42d5f0c7 +size 7194248 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f6c504a4765034e0d401824d2106692c237d560 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fad09ec7493ace16cb46f7340a908a555ebb23e999af3a9cd8647cfd59964353 +size 5109164 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7897e12aba03bb4afb57d102d288055e45b23393 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38fbb309abc4ec59fb774e56fe860f241e44895235ab8636f481176b6db15c6d +size 5257364 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bed72d2d2c1d6f53b8fea9519179e309f9ff57f1 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5230f981964c756c21203d8bfe56adb33c7f19cbcc0cdd21ce73ff24dad3084e +size 6528288 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..40c486fdd5846f72bd79d7635f61323ebc0130a6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3938dc8262bef04ecb5cb0b62f1341f241b5b7ce9eebc18e3fb2ea2a9578bbfa +size 7818493 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9f31e6408a3b5e5732ee9d738e993fb93cfac5a3 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:87c7edd9eb2775320d9a0db633c5aaae845646d7705fa2660e7152c3cca8d3cf +size 9121649 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..687c6f7f3c06bc9a3d3ec13bf813ac00f205358a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07c1e6e16d116468a2bdd7764c24cf5e64a14d85713da06627166fb9affb7beb +size 10447764 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2e3bf5e6e970f4bcfb0493cf7f706cdad9b39634 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1750feba3dcdbbc9f3aabb5df2322e4f4c1bbee8b77453585edb69f3d47f6cfc +size 7726999 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4a3b91d62c0cc8e040e29a3aeea7fbf7bf36e94 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:196e6cec1040e51f9d1bf58fe80541b7e1ff6f5bf4f952ae3b3a3146f3b2ca7e +size 13455857 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00c3f15429e2a7e4283d17c7c268af07a640f8bb --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e775b015df506cc5454a6c991d66a2d7adeb19c85332e2cfb86c1536738f5b24 +size 19047598 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..833633602c58f6b182ff74f1def35972fc12f194 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20d52e6064044c8fbcf1f4c25ff92aad64077ed54ab203b691d2e693b9f20df9 +size 24522403 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0bc79e83717d65767552e6b9a0cdea21d8ee5d3e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7c5532008861b9ad210ddd72f9705bd69f5ca3a362105693b4734a51eeb47c8 +size 29831880 diff --git 
a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1e7ba588ad02e2eb1b7302445537f8e4e7916b06 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1ba1543819801a881b477f411a23359df1c01e0a751cc0ad85870600f23e9fb +size 35282121 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d55c9a3ba276f278ecb0ff80f08862c7ca225d3b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b74c4efbf624435712d66d09803ef0815f7fb789e6b497614f64fb79383135f7 +size 7759143 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5268eea6c6e9e606bc11d5fda36024a6087aebae --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f174e6f4b8a7e7b2e4a5903c37866ba867ab7d09feaf89378e5f7ce40abd3024 +size 13306290 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d4ee76f43c247658c612e4e1b4bb1ba0c411af43 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b975b1208b8d549033846b4948c00ee2c2efbffd92f20cb76aaf495a502d42f +size 18986931 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5824a536ca51b7b8467f0cea6100670f0c0fcaab --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00b7d37f78cad57c84d46b813484b73fcaa30dfef64c6314d1b4a8d84de1f85e +size 49044748 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f65cbe94007760efdc1ecee3908fdfdd1c43a3a5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c803152284c2e4a74eff5a80f07035ba466a96e2e54df192c1be8b7e88e7a2a +size 29923678 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..70df5e1e8abd6cc5c598c8ae6b56d628cb094183 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:455102d4eee3e9b886e97e3937131586def5c9c288b077101226ca2dcb9da4e6 +size 35423062 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7095c2d52f00fb69ae1e6b17cf15a58321f29325 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cc81cee77352b2df55b4b1c83d27f96fcf7318567012b050ee9734ded25a6ff +size 7927333 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9fded3eca13ff0531b9b310200a06400eb6d481f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0377e61b3f1b04831bca1eed1f395459e23d37797aada0e0e269b3b1c63836df +size 13371559 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..17c1febebf788ccf51b30177807aa79ca72734dc --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bad316e1d08fadb2816c3a2192c05e87e2eee8548708bfb3449ef040d5773482 +size 19031398 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eca89412c9de0718df278ee1b9abee4719dc7bcf --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a6ba22c6de79844b6be544acde8d8c029074ed5efc2f6cb8e449bd5c3218ea7 +size 24585597 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f9ac2a8734173d2e13464be85d382f1fa70f1ff0 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cfe7136a8aec6ae34a13ceb164207ad90c74c575608f08aeb1c7280cd926c89 +size 30033832 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..12d80eaf434974560f30ef34eb3c9fc71e59478f --- /dev/null +++ 
b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e79bad3154850b418c33fd941fe94602ab3f02f6a44c5d83484dd2fe4800cf90 +size 35569433 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a326b4d915f17de2ca551b686687c07382a43fb7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9a2cea200839fe740c5f3a815c9e9fb10aae0c491528efc6a3bf4fbaee5472f +size 7472190 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1dc2308957fd93147ac40651d95f04a405350d78 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3410b9d45b77c5d2bda26b224efef8b885b4528a3aabeaa529acfce22dfb2209 +size 13055170 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..64915e72c3cec4fce1acce4817574ad216a176fd --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5cfc35ca8eea1b0ffcbec22938437cdcee3c03daefa3b47f47caf9f9e901075 +size 18618942 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..acdfafa21ce913cdbce511a35b1efccec2122edf --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71cfe830b248047b896b6fb3f6efea697f2b4567cca44d083afd9bec594fbc1a +size 24074780 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e99d26b2eeec722cb817112c1aeaddeae7c10aff --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecece254b2a2f63a7568ce39d306f6538e571841e687a05cd4f66db98dc6e325 +size 29382034 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..350b9071550e17e985384338acd5e4060166ccf2 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a21be4dfeccacb606d12ba7bb2c964849f8bc79f593af6eb72e540f14a07be7 +size 34784643 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl 
b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..371d4d74136f1dfc0bb8c0f8f12dbd62bd9a5cda --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7045e1bc2bbceb13ee68420901c00f1320e730a34915a8d06210a1ae5b98f25 +size 8188681 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..01676da22c3e267e528add1f60d03556bfedd420 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8cf1ae8d926d41398ebd3a837fcb3c655fea77ee5bf9e9e8cc13fce9557a95e +size 13785529 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89bea20120c756c0558a4e47269e6633aa61df93 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca5f531af5fb14cab25198cdf9ae93f15c6cf93f3a4a6150965427a4fd958223 +size 19772369 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0a4be0408c5ca979892a4c0dea84a5ca7580fdc --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c69361b0258fc057955c3691ae1592c160571f9373e2556fff20fed34c408443 +size 50766294 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5fd4684b1d4807ee9dd1c6e850c9d8cd6d966188 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:367b69114f2aa6990a0d3a452865b3e2e0323e4c6b7ca0b586cddbce84d89919 +size 30827785 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..704ce0f3223bf7a48d69f913b9282b6c935b17eb --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c81b94ca06f1c000b3a340ed9d100f787cefbb51cbb6b25043db47f25d972d +size 36467474 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4ee16ebd9878afb4dd7ec66e8c00af9b0d312bb3 --- 
/dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69ec088de0d3507e569ef5fa20e7f0b9bf8dbdf687fcd0e23d8bba56c3b7a156 +size 993115 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bdfdf0b4a593fcdab8d58aab0d0ca1fdabe9495d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81d7e8a3111764eb8c008f61252e2fc7fc69d5780116b3751848080037d83e1b +size 1451107 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..de592f02477f435bcaed9702fe0d181a04a08e38 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6318a26cb86c429fbee715da7e60455e2088b8f6f183ac1c5988faf68abe398f +size 1910429 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..99c09416de0e4e5560ee2d86cf2c014101fd3df7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33bca6747f7e41f7995251884af463d3c92cb5d2d937d547bc428a4aeea88017 +size 2367849 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e2a5a09318e25cfcd22c363aeb484d36c329abff --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e6cb9d46f37b0cc81aaf6743d7dadda8083e3a4ec3c3c2a16c18697f47df7b7 +size 2822617 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..73ce76d544cce32b7bf5cf3d53bc290b9ac598fd --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:410ce81b7ff4294ecdeecb225d7d03b630278f2619bd006a1b2195832aacb7ef +size 3279279 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..161f48d8278366e34534c250effbdf4203ecba75 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ff049958ef7be9484d0b68f10227742f95ac41da7acec486ec6d11ed735f076 +size 1203119 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..bed26d4459f1598a0a8e7a0c9b0c61c4cbcc575a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a88df7474e2f55cc07e4dce97d5244a389b62cc577fdd75b47d6edb7657b052d +size 1753793 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4e6ac2ba382356dbbef96257e8b3b0cbdc4a35e8 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea9f37e34854d3b4d8f1483c35a6f2bb3355031ebf372bcd31984ea8cf2392b5 +size 2303682 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..649cb3d519830289305a1c3f6f8315dcaaa18a23 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a77b2d79109d83365322af4f3a42d142c2204b9d5aa1f390847b0c6ec56f2e0 +size 2851932 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..31eb9750dde7d9a90e46ede198bf8de18b4e3ca4 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d912aeea050b78bb026741d6fdec09ea90cc82298b1ae4199355a8d76553876 +size 3397290 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..08b639ede45494afeea6401f397fb2885709af7c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75441b86b4dedbeac244a0c7d55f421acbfd65e6b9c31a8d9c19f3932c97861a +size 3944729 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..942298e361b03566d00bcbed07027d099b66bb45 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dae8923042fbd4f2648c5ac1f03a362029717ddf78249af6218d2b55bdf1e53 +size 1007921 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45b2782ac4652dd6852bf2dc18fbd3a31ae6b747 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad51886490d5679fe5e10ee061bd78e0fc61af278be4818698798c3a75e0c91 +size 1478640 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_2.jsonl 
b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bbab0ef1fa2a369c39b6d9ae552a97a13fa6c226 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:501f645a98952714974889ad8e34ddcda8e70d77852b9075e2c55483f6064239 +size 1949766 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad2bfb95317ca3ba787998e5d239ffd0abe19701 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19ebd03933287889715006d6f8999b1812c0e1f3c537c4ec3eb81cfbf6e93077 +size 2419262 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..568c10038daabbeff8f72e014bfb1f5376c073f6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:479a4048f861f5570842172ed653de260348568b2b8d089277b83d4f5a186c1d +size 2885822 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fdbafc9a775aae40662a645d8949420d73691db5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e25351d22355820098f08feb3b8118a2d1c643bcfa40209d2e7c3119025f3cc0 +size 3354400 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d2ed6c2d70b8a9802a291c7f6a23b3eaa7ed31d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:734083a79996c631149d4dd8245a00cd486325bd2a084311d3f99c2fda44f2a1 +size 1160116 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a0ebd8a661b432e589c9f0d2bab84ea6fd0fe03 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0746e451d4aaf71433f73b1832918974ba296a92e0115c3a58aaa8f5b03e56c +size 1668611 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..69119407ef73f625c1df8cf4ce2be534a63b8dc0 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a85b40ddf6379f9bcf3ecb7de7acb758e8d597ed22b4f196aae3ed1d3f6ddb9a +size 2176989 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20cbfd66c74cffa8558cb99aa6b8fe53e77750fa --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa402d9eeb886a49aa21feca107da0b3f9835f0dbe611577a8a7b9f6ca555c3d +size 2684069 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce84265dfffcd3d3f29f90f2103c7ef1be548de2 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:505655f0d7fff82e9e4aef91de1984d7e3b18c966aa7bccc2c5a8360004834e2 +size 3188160 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f906cecda9e1f986abc7fc1d756be1b6972b3da5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:651cd5de0785cebe34db334ac292f0b9243e2edeac626caab67ef0fce493cf39 +size 3694259 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a4b3714df69d3efb124a54988fbfde204d39d9ce --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19012fdd78a508242e6e07a1ade623017069118febeb8168866b8ecfef0aa95b +size 1026750 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1b2f7514c28b3d0e60f7619687b86e35612ea5e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd21b3162cacfa57437728e6abdd185de9efb3bad975aa59294b76c811a384f +size 1503640 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..36a43c6d3f5fd138fc7148b208ebe6a26ea16d67 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1a827b745c66076f2cf5247ddc3ff98cb090b56b5c979b11fd34852e065910a +size 1980511 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_3.jsonl 
b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7aef758834ea900403fc990a827bd13adfde6482 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4aadf7165ef693e34eede247238a28111befc0b1c49f62ef2d011bc73aef89d +size 2456024 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7225e162bc9926c06bde69b087f30d5a956a6093 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baeeb1eff8a660d14663ef1c057cf6612b05807e074170df7ca5aa7024cd7f51 +size 2928677 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fcb5a405b6b7454b30e9765e0ede19a107c569a2 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r1_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:581efe114db1c8687b9fb53dab4e91e9c64c69cfa0723fe1557ffc46f4167563 +size 3403240 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..473c1605fb69f5c55232c267b2234244c1f6f4e1 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe83d742a332a8b52658bf0fb1ea4ff1cf4aee86a21052d3f6bc91591573844a +size 993515 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ca5b2bbcba0c3d36dc63c76572d0c6d48c1e4a5d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51a4e08f1c6b2794983a0f23b1cb77af0ea1bf337c058242dd783fcfa7a16651 +size 1446283 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..51f1f44e4dc5f7d7c53a326d0ee8a6bc0015056b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c307cabb91caa08d188e63e1e4053a5a4ac20b71176b8d9a3540c2b959db2ba9 +size 1900417 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad53109db99e624d99e8fa92f00ff80107077319 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01adb4eebd42ac9dd478166feefc01da861e71f43e8653500e5e279305c4dcf0 +size 
2350485 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3ccda213a97ecceef3a6e5d40004ebdebec06019 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:577951be591144a2b9934164aa1afa7daedd83b3de76487d03b95dd99c77e5c5 +size 2800627 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e37b24389321af47be01ceb66a16e7333b869db4 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0dbb5e9656579bddc098ac3ff7a33cf8dd641b7b755f857584d9d2673fb8f79 +size 3251578 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d9f0b5cf66884104a1598791371c4ca4c1c2f02a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0c68748ecabdf9497465bd9f91a4a43b7f009739f6b4ceb8571cf50f892dfee +size 1203517 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a9a80da93c9869c44d155a1a945a0130dac5494d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f18be089c879a2603363212f9c210f0249f9df9174de0d1140a774d476b6ddeb +size 1748699 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..882d8ca092b2c278603db0184baac20cdf667304 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be406510443bf8914aa4cec98cc92290522e784843b80b64a223da03443c4452 +size 2293654 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..858e4b1eb7b621c93766288f4510660c85d8cd2a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7ed8ca48362c4ee5f3354eba4e93623b58f43de5b1a6828b8ed6c4fa5a6d9f8 +size 2834428 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..df60a29792deb9697f6aa05e233e1ee9da1fd998 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:c44a0c4a5095ff660eff0c26dea533711aeba0496c0ac7793a50e4d0499bc222 +size 3375194 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..763793d0e6cfbe45b5e844efbb1c0995bde8c736 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea21fff31d69a6b219370e34c3ee73c59148a7f57eab9833804ecd1e1b98fd7b +size 3916997 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..358ab8ffc4957519e09d1634db42f64b7747ced6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a734045d9a25ed47fffe2586ae58b1c191e4b3aed315f69944843f76eb37caf3 +size 1008290 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68c84d98d6c01085b3a64c630542741e9ce37bf3 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:496f039a787df2357c566ae4c5d91957bc23ef327e30aa50c836a579b31a34f5 +size 1474064 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2a1271a2906480bac6caae9886a8a422e1205fd4 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3432cbc60282e7c07e5e5e405c4d5b28be6a4df68e009b92d65d34a699ffcb1 +size 1939890 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..88e40d7fc5af653e3a6572dd865be0bf8708ca45 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc85ac12f15cf22c5014bb2f854a7cd2b4ceeaff274644e806c62688ccf5092 +size 2401929 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5841023325b5da253f8d0604232b54574264c76 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e0b890e3e9bc811d309c8fc733fcf349d9c52e644967c4b526f185fddfdf0a6 +size 2863819 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..afe3e0bc834ca1facfb20724c9fa1227bf8bd477 --- /dev/null +++ 
b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68b93576f8c888410ed454da08e963d8d92938ba175919b92589567ba5079465 +size 3326679 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..48193c145dde03a89ab01762664e739cdb518d16 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d52a1030db350bd54a8ebf72b10c92a17d39ca2d4782406227477c3203d5ff1b +size 1160520 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78a17806057d4a35ad5a4f5b68e96a2a7db303c3 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb299f27f8617417b7ba03ca97596a2546763750bbe03cc951fb713bd04c46c5 +size 1663765 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..becdd36b309a27b4154f9b36e751d99b67e922d2 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:204136c513ae536ff410e98777879a06cbf1c178a38ad1ab96c911e182a0e4ff +size 2166606 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cd8b963e5a7a3d63e01c6eb52d72ae0c7a260633 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7235b0dbdc81067d053ea89a6c11dbe3aff24e17fb4468eeb081acab136136ac +size 2665971 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae53cf5848b58b4ae35bc807d07d9137a7a71436 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:505ed6a4ce712404d6e8d209ce9a0569a8d7414393b4bb2415b228fa3e7d5018 +size 3165205 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5dec53dee927f2f23894888cf2ca19740f4d2531 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:04af9cc7da00dc75af6013532322fbb1bd8a9fb45a856c97eef9f6fb93df4d8f +size 3665550 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d1bc5eab6aca6f28c4db01ecfbe2c41dafae14e5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aeb39deb948113eb8f7b1375b18373ebaa7fdd7a667150dbb2a9d5c16643d6c +size 1027141 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8f35ee6550067ae44859052281da76b0f9fc053 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a0204268a3b2aaf8b93a5a32c9213187264e8125ce69590237f2d0bfb6f30b +size 1499064 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..96c989aefc0c65a9e5bd1b02c3d07e639e08f1b0 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0556b95ce5b3f782ef53d34b49c397673148d566332e9681eaf99dfc209312f4 +size 1970652 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ad4aa6dc66968a27c0fb000f1416f65ab22e414 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0469e2181e11923f589bc6c55da911fedd5853e880851fc17fd1426ac7e365f3 +size 2438760 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..099141aea832ccadc136835cf97b7581b51fcb6c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbc3c5c5c2faf5e2d67a8ecced04f9f049b1a778044637c35ba104498e7e3caa +size 2906685 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..33be279c57ae4ba48a3dfd6d7d85e0805af8a018 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r2_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12f40730fb0822b9bcbf9452d2679e49d6d3d5353d26561e5d0095501bb810a2 +size 3375595 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..56d86cbb2b4d8e835ec991804429620132fb28e5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b71935bc2905801e850f027732d2497baf7d05a9e65a11ac8fbb0bc0fa49e82 +size 1169169 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9296b150c3f2967161c4f4b299f6d620d7b5786e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:141c4332d5ef80573c94295438efd431e45a5a6efed3aebcbe282f1a98ef6a96 +size 1698037 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..419dac699c08923d62e4061fd818ec382cdaca0b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ea051aea17a2133aa666306d96c8aefd488a25f749e4bd035a99bc4901c9d8e +size 2217478 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d33c8fab497b56cdd2120eb946e76f3b2c81841 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b32c8e84f5b3ff2f44ed18b84b5e1b260d7f0b34704e9606bc5958a4f27b5f2 +size 2731240 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8043ffb8dc62016c874f6bd3f1a148533a712417 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad20d7659dab51aff816ebeaaeb90502324d70ea57d87f834204eb9d50dcf9a +size 3249145 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..26afac4653af35069bd53406924a382af0c91fef --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa63cb3c8b4f9ba495656718a691f06b651e089c15344a8c1e5e63eb10ab5330 +size 3778054 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..50cd2c44a0c7215ac3bff82c261d04420b7fb5a7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d1fcb53c9e5662bf58cd33b1f667f82dd4445fece5b27ca83e932876ee83dfb +size 1421159 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_1.jsonl 
b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2510b9864a842732e14a7e58ddf8c69a86d67088 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4639ad76dddb2d6924c0e00a5ed963a4f4f134972fef6ab7a460b5374c5e0797 +size 2060757 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9461691c9995396940f39906d0bf83f1091015a9 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a32cd3036034cb700d42c24851ef7de931b10e9623c09e5e2747a822f5b79db8 +size 2688665 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..65115e49ed2a98c9a0bcd2307d9a26bdd834856e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02559ad5f5fa6254bf6d0eca1d4f16fdd7d0114dc6bd67384a56bb2b85579398 +size 3311138 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2dec90f3648090969a735ad6cd6652010a6c494e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:383cd3caa5a38f60e1dabf1894c4b7e3a67fcd698338995be3867a8ba24cc5d0 +size 3937832 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b5b9c7c42ae2a8642a95fd37b3d4bc51ec93e08 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60665181e0191ffcded5935fd6fe4a2d28e2754873d2190983ed8e507943ae3f +size 4575702 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3503662080d6575c3d22a0e525b0ba7a488a5bbd --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14892b24000c8d8742b60774bb42f8dadb3f32282377080df95a4837bc73fd92 +size 1186868 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5907ebbbbacec7845f2d1c100c6e634df78d0a52 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:df0b45ff0d80f08376f1b54c7d7510caa8528ca77df6f3596b2137db2040e778 +size 1730743 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..521c1888260e706af14ae8fc6f3024f054ab0f33 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:034e44e9de794fc807790489256b4d57305cde7e12f624d9b5566b37a0801c66 +size 2264657 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..92e150c23e039310150e1ee242c3c8a8d6a725bb --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8593aabd2092ba38a6b36e550403d62e3a37c072aa7ed38e1488901fc6d7726e +size 2792457 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f2b9ba032bd83e38f8d7530f8fc91cfb921a40ac --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:464c0234318e439d7271aee18d96f14a781ef21798920ca7e233ad7ddaed4248 +size 3324622 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab32f0184b0af012d31e62dc9cc80ac3b5752ef1 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f07634dfd52a9ed96a260d7697635da2dd8e742bb21f7060bf86b835987aed9 +size 3867700 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aa89ba589ca246bb0cfc9b106c1b9c7cfd77b3f7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dba2aba075c6a9787e93bcf33663d501d1a9dd42434129b5a23391e2d2fb66ff +size 1369783 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3f91edc80ee4d1133483d75dcc406bfce8364b2c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d56a4ae168596693e075606913325b8eb10f06634a9eb7cb1410ebe8da4138a5 +size 1959069 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..f46c9fa99a3ef1c3b7619c97596f8f50203aa181 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a7c95abcdce4379df3eb31aed23cd9c8a9be7f95af70272ac368c01e6415c75 +size 2537542 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14df815d29bf83a27287e744652485f2008a2096 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85716dc772cb363ed3952c801985cba55ccb2d57222241485932ee0c66b65f06 +size 3110633 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0b8d0638bef889d9e8661d0a7e6e48c590e87a6e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d3bf3d2171d5343a5e466ff36fee1777d56dc737aab52db6a5bae3248dd60a9 +size 3687950 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..01e40415e168b632214c2db900b3b8fc3cbbd2dc --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51200b82c507ef4317ebfdb214fe87a3bc4014fe3ef356787360535d8342bd08 +size 4276197 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d24cec10fa57a781751743dcb332a1db357dd7e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c47e42b2baaca9dddbd702e6be2de64c8bdd6a2616c2e66f496b71f887aa5c9b +size 1209613 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..39933c2636ac39ed7b9cdb1c5a6051e44e115f74 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3f7d1d6d6548fc5354424fbd538b705ba9ce9f7bf9a237569f921608baf3021 +size 1760743 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5c269b8502913120360123e4bba5a161923c3a5c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:918ce5f9dd943f7eca7b156f78983ac607eec165f88e6ac0fced9d7afde89565 +size 2301477 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..594d685cd2649022b7af3fe3beb3678a943d35a2 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4104f1bff86b103aa2a38b8ac0ebcbccbbee731a9cfe9dfb087cacd09455fb45 +size 2836615 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b7866a0bca6713f99ac109c7fe81593070a1100 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32af76626357ac19a7256b5397fe57e998efe44e3566ebeae438684d368854b8 +size 3376027 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ebd64c5bb035c2ccf343658e314ede5a6a25cc25 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_anli_r3_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ed517cb4cebe650209ad8dccb9ca423bc6fd62d7e01c10f594c8879ecf2297a +size 3926319 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b804725f3355bb08c275295586df20bb9bb9e885 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e5c3fec6927e08e1dd34f751b8a26d335b635bec6995f19765dfd673091e2e2 +size 1216777 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2bb8bc0dcb70813743852dd786584ae14a0b726a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:219a50ee311da160367af2fa34770b622c75def79a1f0b491247334907486a76 +size 1670632 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..adad83a5a09d1c87437bf96f80675f9566ddfb6d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:020ed3fb4bbebc8e2ab1baf7db2ab3ffcc1af63f4aff0764b918642f17f4e644 +size 2120084 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_3.jsonl new file mode 
100644 index 0000000000000000000000000000000000000000..9430feb86130989cd65c8887806dc05b7b2b1034 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc6c7905c6c0ce1ad84881809a9206643d8572484b1a60a6fa5f9f3bbb572a8e +size 2577615 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c75f924ebae44eb20fbeb20a9db7e2661aaed656 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c966ec2188ce00ec2a9c63e23844c2dd5af83ecf4602d4eacfee89cefc78d56b +size 3027517 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..22f1abb33b95ad92f33339d0e0e74ff2ed7d1452 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80697b905503217c850115c1f74a555950b4570b79fbbc816fe989f5bc5001eb +size 3479076 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63e2a6c6f5a95b038ad296c717133d28e81f508f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eb21bac94d0ab65600f36b6c25149da09866c9e48428dae86f44e015d0b1e34 +size 1458822 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a283ceafcdb949b0cb62bedca18ad2966fff19a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9737fba2474a181b7402ea08d584af34975491422894c430af8a1bb0cbbd505b +size 1961456 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..50ddec5395cc8d0203313d6545c8f83be7e9030f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89e1977cd896854b7ee878f00941fb89f21c57186027e69fbdc5ffd3b99a26f8 +size 2458284 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2d8e092a670f8f9006804e833dea65f59747d8ac --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cf4e2ef649a3db3e824bbe8b54241aab47364f3b236f53f090f7d737be366d44 +size 2963918 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..de78cbe2a803c828128117850b45668cb51395fa --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d8ff7ce1ea9185f905479a69d3ad99ddc74a4c1be3b149eef48eb99e77dcf06 +size 3461753 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2fce768fc0f5dbd59283edf5ff9caa4125004ca4 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67854eefbbabb7bafbbf3d9d38f8418f11e148186ccaeca629ee8b657590326d +size 3960999 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..542520bff32331b73c16cd9da6e361d07aaa3e5f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eb1251d334bc0c334d94b1957f750efc65c8bd893cb8759a8090b092c519c89 +size 1506169 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..15831622d54a258e6f09d9e486d5789e137ec672 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f38a0170005219d3e9ce1476a7adcf0c25d68342e7cf22b7ad3f26b090b6942 +size 2032559 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ac768837681c1550f2e0bcad0b92d7ac08deb00 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af630196392889e5bc348c749fdb27d298eec8e4b572b43b77b1ec6d16058124 +size 2554211 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8dbed499ebf1ed7157e2db792477df719c1201c8 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17a6f9576b0f80e1c22042c9575786c033bdbc4a7828f01783a7ab2692359451 +size 3084882 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..3afce91e9282c68f045a054c8326a498e8ae7bac --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8901a5354e0599720568547d4074ddb83a93ed5f9f9ad19a00e4b311672963b +size 3607865 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8cda598fe1420d5441e9d8e62f11ab7b486b6603 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edb9ca6fd0690d16f0f948bf700d712f85a8363aec58406b9c9bf7db424b79a0 +size 4131591 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d2344c9582c6cfc8767ea11e7579d27c8c7084b5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4bf363718ad29093ae8c38dd3dd58a60c0201784a11fbee73baf020abc66eb8 +size 1202714 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3bb0d1805acaca224436dd1effc44c97118fa98b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5db112aba61a713f820200f8198caccb524261a22753c2a52b7d2f40fff04e74 +size 1638992 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23fde7bf026b48e52813db39b93a61bab4efaf17 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc057e66f41ed9ee3e0e285cba4cc7076f0daa502b03f686aa5d75fdc1fa7440 +size 2070864 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9ec96d8541299d96fdca26f3294a5c0bbf5a5f03 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70fd4f554701c82a0b20c6f8c07d6c8fa32975e0a6e132a24e37701b142e71cd +size 2510815 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..67006f4c18a27b85a3e4dbc3f269fccae650d545 --- /dev/null +++ 
b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a95a428f16d1e21d31c22b645a6be7209408802218db4bdacf3c97835014778e +size 2943137 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b7cf3118e6bf7d87ee81f2cbbd2e0ea51a2fd663 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f821277c6767c2b77d18ad40214849b187ae0d3cfbec7b8d9fb71a131bcc5d14 +size 3377116 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..18eb6d5141b3972e4549cec644d6222736852305 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39534282920662689a45886702c47bc921a522aac3b65dfee9d47e66a65d059f +size 1187824 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..56acb52fa373cf2c5a98a90a70fc01b29e3014cd --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:201fe689525d76c12ac8f3a1d8f67d5aa807cebb97e2f85e8b3b5d6b734b64d3 +size 1557995 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6797eb637ceb101cd3a0269cfd6129380613667 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ed6ff09a83f67521b090f0c36fd4c281c33ec69d3bbaca90df32090e93f040 +size 1922328 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c4123da58f043269ecd7d5156d69a79d013824d5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ba4b6845244e8e928017831ec4abe49d8eaae9901733998c27514aa250cc238 +size 2295729 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d9cf3313bd58f90fbcfc5302ca3504fe7d36b22 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:319bb30145f435a5214dd6f7a2a7ab9aff2d7ac50ac5f6ddf9b341b505b0dcc4 +size 2661112 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_5.jsonl 
b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90d7895419b9329e9c69e67724a2fae4004e6a3d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_challenge_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58aab0e19f9017023e551f18a3a82812388810d70e26bf9a71ee65f988677f34 +size 3027996 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..587e05388b73e388a8450ded68255aee3f30f102 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c527115171c5f9b5086d491228f76013225496295fec0fa9fb97d2d568ca40b +size 2351241 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e701917ffa113f9d97e4c6e53c908da163e78486 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2b25273eede4c1189cdb32507be13ff2e8da779d0add328589ab7ced35a947b +size 3174164 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f3ef671b6eef532083f215943c01a738b8aa8cdb --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2151547c961b91d43fda8ccc873a4158dbcba14b2fb6c7bdf3766a91b1a9aaa2 +size 4008946 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c09bceedfecba7caec4806a9fbefb2451a66fadb --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d70d896019b728b2fc7d518cf7746eb8eb9d1bf52ad140fba8139002987be57 +size 4831612 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8802630a230f4d8af3d9e160abcb664d1590484c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a7e7ea29c3f0cd805bfacf6105abc650e2662327e35cedf9a01fd88969fb2ce +size 5662356 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eec4bbf970e7329095f143b1841d27580f181854 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:137f5f7d58cfb058194bd09cdfc811d8b1f503a22cde0a4159fafbc9239b8c10 +size 6494729 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6b3e0ca35e1dc0ba9a624a9462de5cea1cc7a2e1 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:643e2b2496eb2fe613e67eafc875751d13f6d1cf0abb63990e947fe4e96e6adb +size 2746223 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9217da3f49db8595245c74154afb13a057c4fbde --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cccd12db4727a5fe92a566c6e6a167d7edf29c8b4af16886761753aadd6efde +size 3649635 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20e9f83be0c027060a25dad606ca55b1f940f635 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826baf37aa4c16052eacbec939039e83e7a5c2f55ddcda8751bcd10c3a821069 +size 4567038 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2de665a631a3a909d3d2654c5bd9ceb0963ba65c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e40bdd009b2805c60fe074a6c0e5c0be111fe86e0e1d1909d7af512741c45d9 +size 5470589 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07d0a341d5b6e97c54820c4760bbde60529d99bd --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d041cf23421f579be1cd61c7f0af8073e7a827be599dd76a39190d3540395e4e +size 6383022 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5cec7cca431f64d5ac5508c8a74e520092f738e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:954e81ef9ff00d0b276357a3116883e3b30c87bc24aaf2f3d587bc58539c9edd +size 7295211 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..38d856faf1d159483f27e49ae6f128ef6ae1ca3d --- /dev/null +++ 
b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a135e1b0a3ecba752bcb9403e27adeb61b31d09d9656637cb1d0b0e4d460177c +size 2843559 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bdea81ca97b5966d04c4f29d3bd27494299921db --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77ed78b70dad2298ff87a13ae3b52676258917ed97286a2c5d25c5e188460475 +size 3795084 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a178116bf4bfcf9db08d90df6c9bb144d3ebb08 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2358b9d54657dc58bf81cbcee12723202432d5d3c3d1f8a2fa8829146344f63b +size 4763433 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ddd6d8751bbe593a82a120581838f01274e1358e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cafd7296564a6d60e181d90cbced2857bb7c25913d9eef637a94951cf6825dfa +size 5717050 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c7120d8616ed3780a09f91b6a58853022ebe0ab1 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:722cefeaf9738cf61838dcd0de9281598210ece756f2f0f7c0143bc6c7102888 +size 6679272 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d625f764e9a8935c0b7571d3f900fcc10a39c4ab --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46a435de0a25b8c1e7a8850d80d5862344132b5a6eec96e25c4e75395cd1ac9b +size 7641614 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2e17d020eeef720dbf1c87bf669fc17b6333be41 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575c376634ff365a7320aad85ed4479ac40d2723b6de503e0fb70b552e798979 +size 2322732 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_1.jsonl 
b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02129b80121eded40630242adceb890eba9890db --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88f81c6eee0e63a61d5260ce4f6c9c03bb485bc7db48b817cf124e3d03eba54d +size 3110021 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d9c17acac89d506c35cf8f5cad75ec412c08469d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37c5dcc6b6769e9d86cf66533fa920ae880839d80c28cef945301ec2d6a038e2 +size 3909160 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79c54f4c81bb573910c08e9766f016a8d1642cac --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cf56743d0b05c3480e844430328c9d2ed45ce4f3570460d902ea3bd54b2531f +size 4696184 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fc6928187cb73da334dbf7cf275ae69a1046ec0c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bff45583a8969e1dd40a786e1d3c696927f223878c38da56a4ccc5d7bc4d65a +size 5491290 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..200078152f611869a038b2aac7856b5e6276d30f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ac2f7d32c998c37d2432de9f09abf37273a237ac7359722cff04b59d8ae3ecb +size 6288023 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4f96db788087519aca94f8289d7fd1f53f5efa36 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2fd13963ffe8ba5d195f18704e7d3c7ed11a9d8602e6a4dc9dd8e7d32408cd6 +size 2197351 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..84368548bac11a4e03fde6c4526df7683883d7d9 --- /dev/null +++ 
b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2a8efeb211fd1cb8ce7bd64c46d5b91377d2d51401c5d528f3fc918e84033c2 +size 2832149 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8f93fe1582db9ec6a380d12e3840f61a36968a9 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae4ecf2a1e0efe22b70ee32988de7723440c42b45eb1f4b3c2675acde5ece85c +size 3481188 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a94d9af1316701284abecb5f04d6f271c37657ee --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d8b3e3bf6f3945211d5f509415b6bf2b40c05b45638919bd311fc64dc23de65 +size 4116253 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..129379d31c8f1ffa812f4b1fac0b00cc8a58d6a5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e63e1dcec29186eec9d0fdf91300d70ebc5a42f43ddc8fecda0a4e3ef7c63c6b +size 4760236 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b25ac5a035e5e28090cbf778f3883bf86f1b7f2b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_arc_easy_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a16c3c8b88d9c26d0666dcbcb7902b48a0840021c126a712e725cc25176531ca +size 5403788 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5d24b88a935437fca6f325ad2c5df6fe0097e155 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65e18887fd40b91cb1d6c13f3a22c46a35af215b226c17437379841e7dadb240 +size 3641434 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9c5fc354de8d02e7835de65ae20eccb0df532dae --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dba5f0f67c20e38db2acc76aeeed0c50369a148d1b372d855f8fb084894d1560 +size 5657171 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f96d6af75499c15901a9b49995c6ab72b25d0099 --- 
/dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44ca1ee48e41bf5b9f2b92f47924b1a96972f4bbbf2f4cb9c54b47cd137c763a +size 7694264 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0fbefff71154defd3525d4d08b2cc2cdd3f84629 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:761d214be487bafa28dd8d10a613b0b6f1026573b805a18ce3d9117855cb8c55 +size 9728216 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3be99429d4d22e3f6449a930dfd169b2e4d40d3e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b01656d9112fe64f12736f18b3ccb9943e65f3a492c362dd843b1de260a24ba8 +size 11768712 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5aec985ae91664ee1a14a9fd11c26056f789c65d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_GPT-3-Style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:692e129901790579f257aae51c57a03a6e0ae6094806a83d9c8e2fdff729b253 +size 13790159 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da8c5b590b0528d2a85723ce5c27e78034d64bb7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd929a63eb499481f1db6ebd617d1fabc32d73332e5264d123e575484ba0484c +size 3984656 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4b96bf2e00af316ddebaa82c9ca6817777198046 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f9a0ff6f03a2db9a79c9f1eeed5754d856423bca6b86f09bbb2ec159a4ae1c2 +size 6168088 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..250bb98db23350f07a96582c746834f6b8f900e2 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aebdf979cffb855b1601defd0d843bf4da3f34c90203bb773619b02182daf1a9 +size 8371955 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..4b7acee27be235bce4900b399fec2c1d3a96402c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edaee3bee064db099ffe3d4e4b92fefe9e6b9b3f6417f57ec1340a30f528ae1e +size 10573268 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ca5c57f4852ba5b9a504f22315c5a932c15b2ed9 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e8ec17447e885190136e03ef4668cf9af2d11f2da685fc6f71677bb21f260d3 +size 12781092 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4c8e2d2800b8a168000c1fb73a6b5d79660f537b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_after_reading_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3041fccf566f546e94646c5fb0a75a90775c4da0452daca0715b8751f3019b66 +size 14969945 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d40d207590fe585e21fa97aee4bb270de38833f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60753d451d8b4c4fbee9fecf9a14420c3d6d35449c9c791d1b72de95f2ca152c +size 4041656 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8331f76d6f91f217c15f3be47dfcbc03faac066b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cda2a40883fb9ebcb9971fe811e0210346614ef70e7ac7809063665995e659ad +size 6260603 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6b60fc63298e7be8254ff4d4a00432b548af4c86 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b941211d092ac541a2ff9abfd1e9c5a3c90e37b8e61042f1718d453cf3a759fb +size 8500722 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..518bc2f98b7e12e112ca7bc58cbc308d0886e5ad --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee83def634e0eccacbe462fb0922dae328b07bb97acc62631d70800f97ffb468 +size 10737884 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..7d05606c9d541022c0929f9739e38cf09096e2de --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:136c7e2fe464c74f24c24686df854e4e4c58726238fb2cf60971a450e4dc4514 +size 12981725 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dbbad1da3da6155a31f4cfdc4e4f24dad735694c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_exercise_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d1b9d96514d075e4b3aeab3cf5389b42be612fa5eda28c1082820117ecf617a +size 15206552 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..928c4e1a662e7f928b1997e559109c84057259fe --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38c55254f71b877f80d439e1bc173ea5ac2b4ad5f88dfef57b80d0264ad43399 +size 3664516 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb09e762b603f70687c28430bafd1bcf2867c9c9 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e59b1a81b6a0d5e176ab0954ee654eb034eed9b9e56fe73b049235da9d99680 +size 5687692 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0c7b99a549f698f7d282b73fc75bbb24c6b7fdce --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa6aede2ba30a2a4159ed9c919a1f3af55f0bab4acc134af81ae619e0d74c05a +size 7732830 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b9b9775a58c57bea4463f51cf67c4878a598040 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cc5a81befcd256a06f03d2843a044710341095fd55863d1370a66faeadd4e1b +size 9775207 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec81292cf48ab2900560e3e14c45bc3621941c91 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65674a4a456ffa0a2bbc91fdb87dfe34b4259db00f64d2a55b610e8d088c17d0 +size 11824127 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..d3f4f4e3fdc4cd2a8b1316143722961e95cc0775 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_valid_binary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ef22036f8bcb4b35e06265a44324b5e25275a64d6bb56a5209e71da14f2a73b +size 13854041 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fde0f24dec84d0ad36254754573d59d0820f4691 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8e0b172c23e80ad420a424039d14a760dfffdbc09921f7e21ae156816bae52b +size 3862394 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7715374b4348ad1785b39bb4526e8de1da6415c0 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ef386a4fc84b20d97b20d7ecaebf02c38a079b0d953914e8dbdd4b2e89e1550 +size 5990492 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29ba02e78617b72fa0671b6483e6940459e68e7c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64fc1342a4e5a3883f07258a9d9b793ed1b5e84339a95ad4052fea0bf371f08e +size 8142171 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9e24e275553a9f40d8f2aae3438cba9aff41ae74 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ada35ff2e9b6ffac4844692021e7d7412b01d71e92cade70271d14d2044cd5b +size 10290374 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7a89ba2cd569f03855cafb9057cfd04d3ccbdace --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eb4f79dfb136f0450310b14976e9ab7d1df5f9afa8fb26b38e80b5c562eb3ec +size 12445065 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..64f64f3eb21d1c91ee8310bfe32ac32d9aca9cbf --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_boolq_yes_no_question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e51982aaa5e6bb834d27244cbdb33746db92b40ccf1361d6f47ea03161d3cf5 +size 14580657 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_0.jsonl 
b/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d96a99663a32fc932eaeeccacc7462560e2e5877 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d6126480f6c7250b854a223be1446b3fbc839e24f5a67e46dea97a3fc38418 +size 55148 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4581e9f3779fc56acdf34e978ee9cfe10d2fdc3c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:777a5518f092ebfd7a677ca86ff6571672392e75cd02fa4059dd073316f6f6a1 +size 77960 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..51056748a99a33026ef9f8466065329cb534bc5d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb916f14b7c1c7c764fe4455632e8b49fab1370f81fe003c869279fea37bd970 +size 99590 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f875fd7ef91ede69b496e6aa609033bb724b2607 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41709a39c6cd6f1eefa45c3ff1b01ac2ede922aab6eee687e0a190909ebde39a +size 120779 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00c927df597ccc12bdb2861e7545732729ccf9c7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77709fc6dc4d4a18a5a961a31aebff063bbd9d3db47c179911fb377e7b9a8875 +size 142909 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..92f05cfa3a5d6934d78e262a9b431e7a00ee7bf2 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3888e3c7d12019f2336d8556aeb80e32fa60d94f9bf1e29a7993863d2765307e +size 163743 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..806b11a9c8b43716ddc929108e3243bee2b45fa9 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b09d06d81d146b2e5697d769fd76799731df2c2d70081347d8168f300891d6b8 +size 66218 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_1.jsonl 
b/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..698d6718a156fee489faa7f55f1a189f580ad33a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1693f018ea9662d6e12d9be5a1bc22671125b663ba049f2b536ece6214ac929 +size 94126 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..893e032fe2875d4b74cd1a6dfd03a128be4b336f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6775d8f1e28908a234157f1115d02139e659e567cfaeee698266fbc993aff280 +size 120794 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dbce62e2f39fc75133eb58cb30b4d016a9aa365d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8e7970cd7940aa4cbb14ef989d445e6324d5ef16342d6776abc28cb5761c19b +size 147030 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c05ac58ff1d09d74ed00b51d805964df320eecb4 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bc8ee9f828f380785bce45f1e0dc9605408ce1521f2f7bed0200152692d2f99 +size 174233 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..010591948bbc7426108c660d9927ab4e89562509 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27e2b513e7addb295ba609895acb6ecb6c02816ba1d5ebf675b8353bd2b9f15f +size 200145 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..34b5d7249bc4612c239b039935ff3561fb96334c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb4ba9939ef55eee7f5c05c0dcc768089b6d51cfa0da1391723e41d3ec564c66 +size 56294 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71d915e82080201bc20cc8f3431ee44bcad403e6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2968da46200dbb73776d530f2887bb97caee5c1a661def8839cf56ace0044d +size 79780 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_2.jsonl 
b/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aa56733c3a01f46c90ced6f32314a3b9f7fe4bbe --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85379fe6c1c67e5e7c26542360913878ee8e6d0b33d81d8657a0b49e8229db01 +size 102076 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9fe7f4e63176f9caee574b7828b3b2810447324 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb856fda01ff371603034453c3fc109f14bad55cc4338109e8886d8301072727 +size 123902 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8554cdbaacd21ffdeb91ccdd2ebe59c49c818ce0 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89a3c22367f81cf21de1d82e49f8abdd25db0dbc3675d47ecb9b50dca0bd4d84 +size 146675 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5d7181c95f5ec879d26137831a597b3463e81d21 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f64c512e1d253ff194d7048a97876d4452ba588f0606684d1f92319f73588d4 +size 168159 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..129b6a582f617e8514554cacedf4306b5ad5bad8 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c5df0811554c4a7f12f7769e15bd5d88d876b9a808f040e23945d220ff275f2 +size 63935 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..65cb8b7d617c5750441a7c92925e809570f9e438 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a124450a046d19feb2e24975cd24ab7de0f3772458171d579e16c101eee16541 +size 89644 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e0b9da8ac2bd40e258d1904d1d6d6042fee079d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7115717a82d95ab1d9099ef9be21c87b3af7d219d276159e05338f355a6beeae +size 114129 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d908858bb9db8515f744c8a5a7a646a9a20b0145 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c54fdd85c8be8dcb5d5e15e64eb9b24ee9bf710cd59907dff46bd14c9e48233 +size 138160 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d784f03fb2e0e65d5d3f5988ed215b06134b2278 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2792eea71bb496b501d7e80d0afe4599d125cdb9fc65c654b836b48a5c3d402a +size 163148 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eee60a48c3227b4bac93947408ffb484f1c353ec --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0573bc56194c4777f8e5d37763081a4757d6d676df0a56fac55449f499b207b1 +size 186820 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63eb0d728142fd95682e7aa8e0b7ef87dcf7d468 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88c2cdca56923d68e060ee01f13665a713b89e3451a1be6c294e41a70acd24c4 +size 57295 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..191930176fe74cb5fbc2742e8106e1eb995f30c2 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcf95aef3bf1fe5be83db59d096ca2b25b881709786632595eaf5c43d473e05 +size 81124 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb3620db88f410cdcbb8d3c07438a95bed7d97a5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a9fffdfd1a0d9397782abfe9b968e67a3b1f1af49e24c5adad087ae941339ba +size 103749 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..e16206fdeb20ca60ce4a5dd4792aff32437de2ba --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:536a2f2df74e10655f66632fe52d2c2572ce9fdfd169969a2198588bf147690e +size 125902 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20606438b7a2a635559ef50894902046161bf041 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e565853dc3bae4c131fb53c61b2318245c20242b20412a64ceb8fb923969a84 +size 149027 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f0963d9d47340838462e25ef5cd65902cb223d4 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_cb_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03f8db6777e577f9389c4e49e59a15cddfa45a75d668446f1d1f0aef8863ea86 +size 170846 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe224dbf5ad10bbdc419c2d3e2efefb641017577 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:434381c4f33159d582219d793c97ef55bffece1d5eb2bd2806cd7cd5990adc8b +size 92240 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f8f704f14c7aefc80265d9cbfefc302d36b43ef7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ea6a9ace728ded6d113286202c806b85355f57794faaee90196469d80cc1ccd +size 111550 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..39d6c3b63aec0ab6ec86f39685fd39b04f82bf0a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0047ac47a9cdf237f83e663255ca64b16188969bc0291eb13c0e8aec46c423df +size 132093 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9c19ed62665219987fd28c2e1ca2d007e7ae72ca --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa997c4d6ec08cec9a9dce93ad3772e2a7f3cccf61a8532980d9b4690eef78cd +size 152198 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..d9f6e49ca10e19827408208cde8e5e04df886956 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69a1ac7c44d77e962a743ab0f983782f4eedc3e371653115f8919fb8b52ee81c +size 172047 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..403be450647590cd8aeae0a8b50e27cd7a7120b6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_best_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaf563eb89ad80041ef3ad68df5b00be781d9f1d64c3346c944f8020d9e88613 +size 192008 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..65fa5280fb3a0110db4dfd646dfaf4e78fef7e85 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a051ccc96dc1c6c2508543910b81d1035724d66d63d5f2dfd818b02a2e45d95 +size 87877 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..576c73c48965c8862b90e5f2a84e9e7455217060 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37d686ce0e122ad0ba5128bec55ae41adb7e1676ff1456bd501f0d567510a8ef +size 105061 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b832efaca1e4d5a0dc7e19c16a58d68a0c8b4ad8 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75cf93af951a88590051574dd01ae3cf7e89016ec69e3eff3279a179f036219d +size 123423 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60249cd54be90396aaaf94fde5036db54b6800f5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3705c33d5b7a4b99802c48c181a1edefc688b3374ca4ca86d37d71ecc5739880 +size 141439 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07dd0e909e9c62106696c5fac71139fbac9b87e6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ab8e82a3ec00ff9dce466519f3e1a2125dc75a7f1be20232a4a9e1f87ee8742 +size 159087 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..356f8a2ea598e1b9167f1a5617fe67f78874b0b7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_cause_effect_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53b67d19a0acba281a5de811e1b28c64c61793a7f061ffb7d970c6b895ba3791 +size 176905 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e692f4df853dbc8a5c3f83fcd3da430ecaa894fd --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e023ed1eb20555d0ffd61e82c70779fe38bfc030b2d6a93d236b894f11c8e369 +size 85169 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..66fff2a7407de99c1b0b35167ec40b65eb2fc02a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf503c43f36965e10024aab3b69c440f490528d8e946adbda27c270d1a830e29 +size 101232 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..225d4833ec61741524df2f5ddbd441d13d50b20d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9182e15cc513e039698c58965635fcbe81b0b1f461430f439dc1d5d1326d67a8 +size 118483 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d93f1ca06953b8272cbbc8b6794004691cdb522 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61f104f7a2546fd0f867b22b9eca6952f27cafd3477a40bed957ab3a87a6d3c6 +size 135387 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7bd275232f2eb4aff3b6a1176b5d31793b6e0664 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9733f982561644b861250c4a7f432093986ee1619b824a5f74c4e8da9c0af02a +size 151952 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0df7b31bba96a91a0a3cf5a67fe5be4f880c751b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_choose_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6e2e5c84f9ed6e1f775ab629d5cc6083f0fb7d7ceb20f99d22d6c5f9f53b220 +size 168745 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bad38e843c8c10ccbf6eff58590f0d60fc820f09 --- /dev/null +++ 
b/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbdd37d3e1f74a7012d9cd879daa6ac1e6658f679a2f8976e3d9a73f4fbdffea +size 96748 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fa74d8d8d3422c74b149712ebe6c04ad427c3df4 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19f71121b07c33874144c5332f7437111a9d21b76dcf0a944fed0969adeea117 +size 118258 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3e7eed1143a301ace657f0fc676f8da4f3aeb1a9 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5b169c1f9ce2768f756614a6e74bd13ce4751c42bb30bfa943154bd33803d93 +size 140937 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f32d0ab80a727de76b92e4f547f619e0d255bf1 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feb80b1e98c6810b2a9c1f3d4a51c3e9ef401e2feca4f0702f199752fa0a3ace +size 163225 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f3cdbb9c38e0dddccf412fe588f916fe77f6f8e4 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9d176f0bbbbfa8f102a15f1755942ddb0f7de0d93c944eca441ed97ae361a9d +size 185193 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2841688c8751b88cb87468f939ea1a7b202adb79 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f0beb829c3c98f7bd24695eb1a31f206a89cd3b36804c494392fbac3f4a7734 +size 207300 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..15f5816007a5052b0e95bdc4073ed6878d5d5b4d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:444701bad294b73ae3c6e208beaa2c087bdb855c015f031299ab6ce7a63b3fcf +size 95791 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..304d114626f7d066a9298adff46e9da8d8721fd5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25ebb64e97bb544c65d48c65afe4feaef865c84340bbee6cd298b420f8304d1c +size 115641 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f4d2d814e2994870c6ab9463eb0baba5ad324acd --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6736c7d5c68288d81c01b9da09064fddbfdab9b1498c5c510c87e92ef2fabd44 +size 136685 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2cb44e3ce9f422f589a6eb1101735e9be2d29d93 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cc0f74ac768ebfeb671a382aa7bdd857bb56831aa35e6d486018fa5ca8138da +size 157409 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da041583826fc744ded3834ea6be951c4eb2dc51 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64c6d2381af4422c3405a7fc4985f74bed0bd8ee137efd515a99f9508a3f015f +size 177758 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8e1f934974df24d54cad6e60e242ac6188dbb985 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_copa_plausible_alternatives_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:355334a5ed952fd3e19d308bf3a1154fb6cd7b7e6bad52f5aa248c0a1cdc83b2 +size 198343 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..693ad11e29fd09840181bc6bff1988582e5df990 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47bbbd6714129354e58e09b6be397af0c892d8d4c9f772f4e115279d7e0bb61b +size 3267424 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0755727975f23fc41c9eda8cb311b13e8d0b68f2 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8c6a5998e7699dc43ee7e0ab9a3a050a3007462beb9d94229a8cc77000de0bd +size 
3922440 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fd7b9c645d1827dd83b8b7545f22700e5e363d3a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d3cbcb8542981105f9376341966ed66d7c7f33721dfd90b26ffb033e0d6dbd8 +size 4886622 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf3e5529ea7b161578a1994840da9f61058aedf1 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:919f7f564a99c3f0ebbe72ac5e8c15bab4108476e62f94e0e59dd8ef52b26ec6 +size 5828445 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f7362d409fe9f48ea37477b63e943cfa650a17f2 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd734365c13f77d8a26e1369773e854e116989dd63cacdfbb464057883b39f08 +size 6764947 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a3cb4dde5deba290dd593f39128f83b41d195ad --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_coherent_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d4aaabcc4725d6a6bce04f131d09b84082365fd30aa5131dca231baf845b364 +size 7703551 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..006e2f59207c3869e9375a22bc2d29a8f6af52db --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:370d5346a6bbd381ba27c820e7ac0bd3574c745ce85163ff049b93d6454e1c59 +size 3391524 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..852e064c26a4d4d827edaa5723bb38d2fcdd9259 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eea8a0b58d93a5cd0634e0d7e49e23dd413d9bca2ed6a986fb490bd8ec8ea469 +size 3860578 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..327858e4c1ce5c3cf9f478712f7163e467d2d3eb --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d3da2b2fa44296020ee30b48dae04ef5e22dcf8eac2c1b80256c4af99071af7 +size 4787178 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..850715cf4a6082028d8b38a215fe05c4cd0f5906 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:080203fc3692d0b4270f9693be665d6dd505f478363f1c0d6a90c00a479e964f +size 5702514 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..91e87cd3887a92ce5681c21841c35e22523bfbb9 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4fe28cecc81cb264d7e4120c12701f09cf754578013f22919d3fcb95c321818 +size 6611057 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13fc663334f048882eab51f3015d7216c780ea6c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b60d63471672fc0103fdfb83c2a10a5960bb37b13ae386b2d4774e489015751 +size 7523825 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b8f823d5f19d3a9c9b147857f8368a7b7ec4b98 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:885942d09c131c0dd431bf30e66ba6fcccab3dbaaff22711fdb115f80922e3c5 +size 3687192 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b5ad468a877de02cd7b57d9c84a90aa093691a3b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29aa246b5ae36b675844fd4afb7bdf4d2abf424e4499a2d773908c7f3d9b14e8 +size 5008997 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e2603ff74dada38096e891b15068e3e2bdcb972 
--- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b631caf04494eb4848be628cc694544ac8f29135c044b9c30dcf47a67ae6c73d +size 6065111 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1e0cac69b74ab139d139c7369061f31ab5d0275d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f68116f32575b88e3ebd4fb491f2854577001c67bebc9c53b121a714d0073a +size 7102976 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a533fa2754d18b840d52a55148a950d7e9ec56f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3100ffd116257a1f6c29c02e7a0fb45968f4b39c6acc4387d6acb3bc6c5d539e +size 8138513 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b5b2836face28886698181447a46fb85ac44392d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f4baeaddd903463a51363eeb739fbba769d0aa318585d800feacd69b2c7f6da +size 9181448 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61859b25647b0efc42dadbd2f2e3396e5e1676f6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1f38ca5d0b6b937994f4c014aae39ba22a8cabbf7d93ce61d6bc967b2c60dd7 +size 3619850 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..39e53f8c9082115f15330733db614a3fabba9ef8 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a62cc98d4e2c03bb45237e2ca4c92dba60075e461e29af4a8bdcf563a92977b1 +size 5001584 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5d2791d14dfeea03b0937dcf8aaaec3b7626a6c1 --- 
/dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8379ac5722fcb8fa1a439a45d7dea10021e5808ac1e0bfe66c5e361cd4d24a0b +size 6100164 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d7b17bce50f1eefe50a68a4415e34007c5c6b95 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbffe91deb940820badede81de58b1669764c990a27e05bf84769345cf0eea49 +size 7186336 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..03b31ee8401dff8a84218f2197fa0000ee2f1791 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:005725286c48ce2813562f700dd4b6fb2aaaa5fc98abcc9de3351aca9cc555f6 +size 8265692 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c6248b9a4852a36f3a43642f7276b239f9a93ad --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3f16f3f2fb2e8f83f7e91921bba0bd816b7457f521963260e0f97ff9c571701 +size 9348485 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd588f91daa568d2a92f608c5c9099d4371b103a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d439ffb6401f69d0529df880b4c7e8ae680165111f7625b05df96b5f41f7aa2 +size 2810034 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..03f3b153eee12550b262e5ae07be07472aa0c552 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05a6f481460b167cf53631143206d5ba12ac93e61ab069a773a8fae9a99fbd21 +size 3448859 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d90947c6827cc22d526b1cd4c13bfeadb038709c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:486942c80f1b54390b18504b6e593175e6873c68cf2cc44442e2ff58cf3ec9ce +size 4249499 diff --git 
a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c2656a7283a69a16ba694b63c6681d1d94802bd6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:550ccc6eb614a5177fe510713ef0adbcc34e174a7e7cb0c7c2b4d22470acb9cc +size 5040216 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e1bb2bbc15513d28aff048807bb6c9aea6916f38 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56bee1961bd81099c74a653cf7544e65c94d77765eba0ee822ff805aea40a259 +size 5827893 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02d2fd72aa2a05e604cb9706b5c52c72285f9727 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_e2e_nlg_cleaned_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea99b0f8f71681c48ce046d0e51341e30f804c5aad425b99b1e286c97c2346ad +size 6620986 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e6b15dfa83af22b600951be7b272815ccf875381 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9281b60dc5c6f6f46d896d67cd29d40ee68a5c200bf91a7e4eb3ea41851f9e48 +size 2892960 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97b52746bcc5ecbecb607507204a9a1bdbeeefe2 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70310d33545bfae65b409c7e58dba0f89017720b4dbf5e890e6c35125a29fd3e +size 5039513 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a002ff69d81ec14e23192f5f7605d6de304231d5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a96c4ea2f9dcc78f671c94b1a56a74117b71e76f3185eb4ebeba158cb26c1a58 +size 7315386 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..f8782bdb573de46e13d329c358e07da35682b6da --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58d6135475651533282cf01302eb3f569f039cf47ce79b9da0758186e1029ad0 +size 9601695 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4a477c5fe51ed9f15c98763ad196062bebec1389 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6b825cdc640f1e68c2f7b830e0a5d505879f6ea19c8236ef58d2f7fe0a93c4b +size 11784399 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f8bb6211e98ea44857c96b87871ba2871e127e38 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e95defae7d272e942974f91e112d518ccd157332d664caecf9a6f7af4ea2188 +size 14076739 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..849eea676605f0534223ead3cf42b2167214d47b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fdfdcbf844ff2a3a8624903839f3c6bae81abb9dcf5f4ed200f1b20965ae7a4 +size 2772333 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b71adbfc19601ee27c762503e88a6224d2b456e4 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07f2007ed911c722e743037c83bae994ad89d31894777d45daf698a516f41ec2 +size 4872266 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9d2d2debe2c8e856d62310863837aac9d6aabb22 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7caddc105c779e4ac10eb65a3985fa0896e1df108686c80a40474a7608f4c22a +size 7106922 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..440e18b27ad2f2b3d8da931cf5332aa7b6e989b7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42f5f5ff7aae574c87af8bbeab37d0a3421dc70eff7b5b837b86eb487080d736 +size 9370987 diff --git 
a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5dffe711f8e1c85393a53bca3e9c9662ab3219d7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5728820da22ff080ba578245d1ddbede09ae2a0551b4cc08355b2ecf7e633345 +size 11532092 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f609471e08306cef4b34840084328f54bf63a0d8 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_DOC_tldr_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34acfbd719fc1e3e5ab686202287ffa66d0150d43f7081d0d728001b98075cb6 +size 13793401 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5358b90cfd60edb55a2754d6522c0a6eae14fe0a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fbecb86d49a149fe31bdf01989a3b9ba3ef5cf2f959aae95b9d0e4223e86c93 +size 2797162 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd589a4fdafe7c0dca9de4ef9676c04bc0d8a1b9 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db586ec8aea366f0928631faba4da9644b255dfd1f9a5c1818409b53529d63be +size 4950413 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..500a8f763b8ebcb91eb99b67ea0f5547436bd9a1 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99393205ca1c09f6d885e33ae1f433ca45819e6e7eb89423ed88e0bde5a090f8 +size 7201692 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8823db6fabce4efc626552bae6636d4108ded72f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e40bddea6c6f945c3c41c468ef90a763a064d591a45cab4854fa232f02d235 +size 9470578 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97c5a5cd25cc794d1d0176eaa56d937068996dab --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:198ed5be6663b5ee3134bfb0735aa1073dc72d6ff76dea32099b6432be19e985 +size 11630406 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0aaee5fe029787cee5ee904b4410f5c9a2883942 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65d4d0227bd4d26f3e9a99d734f3d030328c20d626c6f922920b3dc155977f38 +size 13896009 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2915b6a64a6cde57790522bb7b73bd95af5c49b6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4941d9e72fd5e7132288abca0c7f199f3f2bf073203dc37ed62c674cee16962 +size 2832727 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a88da3634fcc1ccdad784b40e778615b71d4893c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:715dd45a2923c900bcd0328bb915d1480bc40754b1b982dfe902a4cb37f3ff1a +size 4988991 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b09b31ce206eec0c979da3c28bdfa12d9a31cda0 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a59d0d7e223d884431837955f05fad7c03ecc41b169094969abe2e0d8e1865de +size 7202554 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dc8afec90e1880f4582606a85d9cff68ca26fbcf --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e94392388b34b850a7c1e1d3db8fa2a683ba5fd44ffdd6b07f3623b0865dadd3 +size 9466294 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..43c2ec112235d0c728b089cafb92ac2806549db5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a71600694162a3985a0ef6f15c22a1904299f2e27343801ff4377e03f7fbf6e3 +size 11616949 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..64b75da0bb5fd0eca5c0262409e609935cea1963 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_DOC_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73f68cd70e39fc4203efb8924b66ede3d7e9e228d4751b3f4340e363c792cac2 +size 13882337 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..03a1310702233e356ee4befec96cf745751bdcfb --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d27ed999d7a4a394e789f1583a3fa6b06fae27949db0cffc4d3016a1f711d12 +size 2871654 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2cd3b36a3f684947a3d337192726d9ae1b53cb3e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:858a35d7e6617243b3548ed4e496f00c24ea583ab4378521b776c233a765ce36 +size 5004540 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a18e70123ef5ea00f245c6d32e76baf5ff3fa5f4 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b33a85167b9ee8ad0df60bb72c299acefbf14899f88ad2af6b82663a916f2157 +size 7269102 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..df54fce6c99d9c322d5d43944c271471a6161e4e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fa014cd3e76319ca5c56cfec05bba8a911b1cf8d9c988d9cb49d6160d5de860 +size 9555602 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..15b7aedac43d3e5ccbe5db64c9b2a1e52d88452c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d1d9afddd08854bf818259e4596ac2f311c851c461b1111ee7aee4c15c9895c +size 11731127 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7c679d9b2108321f196469bd440dd3ab120345c1 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d06650b16467671886543128aa88ddb210ea9c5810d04f4d84bfe45cd9fb4eb3 +size 14018439 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6b932fd5d8f88660a46ad682e5fb90e345611ed5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b99dbbc1c0bbcf89d28b81401c4f6dfa6393cd880ca7190964baf370cb930b83 +size 3625331 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dbd0e32896c246d84f82970b5f5fb9c2901a38e6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fa92fef8d29f808a18698d4400438479dd0888bb0467a980a8e6f46f4998188 +size 2603627 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9584bcd5a517d3349b83cacbd5a3d3f7c990b801 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3375957301e2e885fcdaff58c3601560c12384546d62031b75a569697ed735f8 +size 3282774 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3837527708cf25e2237ba77c69f79eb138a2f301 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa89d2ad7b0de33129b7c133ad2d7c07c18f5a871e6c4fb55155dee429685eba +size 3956534 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cb60aa89bb7a6d58f051747d2a7cb05acfeb6d04 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47c41c6c21bc01d8ecdf73d5cbe4ebf8d5bea76da52f1b6e0aa02b016ad31b68 +size 4648122 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0069049d4f332de7cd7eca879b4d2454416bb4c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_Correct-the-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b0e7f8ac57b87e10b2016852101d9fcf6f258241ef881b0ecd0de24a391cadb +size 5332611 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..e99f0a94cf7194d9fa130192d77f9b9053d21cce --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba4d749cb6f1fd33a212e0d31b7e18f3e7ac6e44d37d636e641cf4f852372521 +size 2115935 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fc1c02e93b3978ea171b9cd176bf5699b5c8ea29 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb19050b9eee1854c81f96c0b85050c03fbd7545409f029101baab0d71909551 +size 2900909 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..76ea5f5040d168454c03ccb72793d049fd0f935d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e79d8d04dc87ba6715764b3b11c5947e2cfe99676ca40068f24c84f884ed75c4 +size 3683911 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fc73caa51bf0a68e8545f01453e9395caa5ce7f7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3eeac072129eb16c8f9ec8e72335e72f13d4756a8988c851fcc6af5149dd64d +size 4458372 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23e3ccf04aa00fff1c61478f890b4c5ad4e83d66 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98cf98b3e871ae6485ce0cc01791bcb4f6d4f05ed2b96166c888b7dc4a92ce59 +size 5254516 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a691fcae04259f9389a2599fc6130dee0ad54cda --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_choose-the-most-appropriate-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66f716fa9d1c2f16d8e89ced2bbf6003963ec9d1a44864c094dbcdb3e7806b93 +size 6040677 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eb44ebf13cf29b0338b85005a968918a1a8b2cf7 --- /dev/null +++ 
b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7926981fd77e5d4271b63f07b1fcf12c043a66166829b1bac264afbabfca6d0e +size 3017790 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..38b0f1baebe3a33356a400077f4ee980e271f71e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68fe5fe6839034c468f25af52a7c124c9756fe4e4d9e395083fd5a4de01c06dd +size 2485392 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..34c33c8e93ffb67bc81cefc40dc565d5298dbbfa --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a87d1bd6e77ba6df7971774bd74ee19e6d94d9f093d6c11bfbeb1aa96e22681b +size 1909474 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b53fb4c11f16c55ade51cf83ca3672c3ac3b37c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a92c6c3b3a0f3f0bbd820627eae442d4fbf37422e1d4634be2b40bcf3db05ed +size 2107118 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4244ef5b03bc2d1ef1de162c0f34044fb916d3c7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb9e436240623da84efeb5f9f5f497a5ca1c87412588faf25855304b3c40a117 +size 2387935 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e6a7da7f48b8a1373146fd9c449b11bc023aaa6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_no-prompt-needed_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3685669e9d2512d62f73fbb0067845f06d8fc479d64a2eed5d5dce84c9e40cf0 +size 2674699 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19b10cce9e321f2a15e2cb02d512cf92f80f7096 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e279b4b3583932934dad3eaa5314f3944d29511c59f3fa0e1ad84fdfbe922906 +size 1864129 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_1.jsonl new file 
mode 100644 index 0000000000000000000000000000000000000000..a1648486e062e2942f316e24b60ddde0c48f8593 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24a1fd8e5593a01891a30508e488434eda6edfd8140cd6fbe4313be9b4b266ab +size 2557203 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6af9a00f48562199c39ff54ec3bed95516a060e6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1cdf694b88c43d1e5a99652dcf1ee8a2316462ca474d0a4dd619a8bf2b7c46f +size 3248305 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ee5de4b8707bf6674ab95ef628905a04b2403d39 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8936956671ccf3852299bdd6ebab9c5d7d02fd0b0ab5fa47dba638ed8aad2ae +size 3930866 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dce0ad3550b6613c3c3f95f50e101bf097b8b771 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773b523d24622cd036ccaef67707c791eb5c58af66b1ec70b9aad2ca3f698e91 +size 4635110 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d23a7e06df3c89fe4b6d08f892c81a024a5c80a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_pick_correct_choice_index_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28672975c24f4728d0e7e86314907baf1c3617925eb3f343a7369e58191b8a17 +size 5329371 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..65092258da4e2c266dbcb0456933c79e889d3b98 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edc2fb74c41bc91e3b0f64b3c320078764889b26e5203a59d2e6959883630305 +size 2256692 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a61e83a05f2580d491eea9486e1f8ba2fe9ff49 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:38d6a74a2541094cb1586b3472703cc18df3cebbe4a59934f6e437755215c8e6 +size 3017463 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97563c751b2c6bbbe86fb8667da6fb2cc20d5089 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561be11f3fe10494f28e08a2ebcc4c5c293c0d3dcecd4b414bb60c5a2f5ea8ee +size 3774068 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..936ccfe38a3f7862cb5d75e813ce23b2666e3fb7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c139bc1a087933b996d7e6ab760714c10bff006c2f5ab6b3f01d4de60341407e +size 4518793 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a477f63bc04554ea632534663726900cc1dff594 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f10cf08fd5d4dd24f5ef6ce94c7e8e5c237d36ca54df7aa5f50df1866ee2790 +size 5293709 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9aa4e6b943843e93c103bc619d9595a8dbebe7f4 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_piqa_what_is_the_correct_ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b31ae82f732130852d53241ed511ca124897bc24bbe0ebd9541c51478dc2ced +size 6055135 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9366644b85d3d07eec8e281d4e22b7a7d8667951 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a76116026de665186e0e86b598d6b87849c4f590f67de7e3a490e33593ab912 +size 639908 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3339395a4aa72cf94d3a8725d5a643ade5993d7f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76d068caf6b403f61a650c5a3eff76db619cc7dcc025bbe4a03c3e06cb9e7164 +size 754991 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl 
b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b4476a143511d9cffe339ebd2b1b419153fe14a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14b81a970968a62e32c670d9c7793c627e315468b67538f340ce4518369e8ae4 +size 871235 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba166d290c5e7f302e275e2a8a6703680bab6d3c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa6d989a043b1034d31e548d1fdb761dd122e1f677086ec036b3cb8b70c6f709 +size 985616 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b7d5f45544abda04367c1137ae1aafb8edb8f6b6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99b501c79ae7a3c40f3b31a109a3febdaa2167f01134784f17cb315c99ee98ac +size 1098333 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7df24a2c2972b581433dcbccfd5f2d923c4e4e73 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:394a8e8b3a70d3365e795a06bea8bce93032dcf24d7c256dac0d48dcf1e67abb +size 1213535 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f977e22a209dab46d4dca3156966d0b1c0fe602 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a73e8a73cf0512ef0a8e88d62779b583f3eb36667dae35db05fcefac61a8c087 +size 1182480 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ead5c4eeb69285ba6758481d9043b2ce5ba04d04 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43f54b0e060207a24390b6b893f8e117fcec33993e5a42607dbe440dc538950d +size 1779277 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fc7a479dbdfa701522cbd6aa6b340265a4526074 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:eb004433cf4311fa2e33602d581271c597f9684f9315617ebe7ffc63f8bba3ef +size 2388520 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..918c7c6b1a3dddb40178f89e3255a65b165dce40 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c18145f81e678ff92bcc846872ed84cfdf98f38a1c63beb95874a7ed39fc1cc4 +size 2973487 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d43d3a0b5c7c0dc40857ca4804257621d2c9e846 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d11b3e0979e1cea97d879cad4be27bae09aca7ce510c932bbe8557e9e49e6131 +size 3555835 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71bc4882cf9227a8eed6e0f62b72e828413a9e6c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Direct-Question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:587f9d318478f4c7b323e2df70f558d96255af118fa35a096d81c1d79f623e95 +size 4144660 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9a4c34151398d352c8a11c4f9bcb60e6b86a6e70 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb49322e329f4e53015185fa8f9d0e6dbf7e90d566d61255868afddb9eca0adc +size 1328989 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f4e42db9b69ac5467d87f8cdfb4c80030f450f83 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df141ecf826eea8f2512a97dd75fe3af0106af612652b407c925eb028ab84af4 +size 1524734 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f385cd5f50b46c3b05b4a6cd32734d0b20481565 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0bc191c99a9eafdfb82b2fc1af8086ac2b95eab65ceb218612953e69fa09e0c +size 1720808 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl new file mode 100644 
index 0000000000000000000000000000000000000000..bd4eba80447440702e71056be6fccc98a0cd60c3 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9fd3239fff490ab22938bf2cfbc02cc727df0ad642cca8815f1299a3da367eb +size 1915708 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4065f2d2a3c57f01f5f415c595a7bb0ef2e6eee8 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c94f141570cf7257e10f818d95a1f782b37ffe13c8b7d09da58df25faedae93 +size 2107444 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5441c84a516635659a23708bd922e8bb6f847a14 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c94d36d73322543dbedaad33bc6813c5733faf341d642295b711920934f2b0ed +size 2301018 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..373e1e38852ad4c653fae6030b07b13debb817f3 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:445bd495abd178f2cee7c464093de000ea514f8a96cbf2c0ce11207487247800 +size 1935156 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..888b4ba11ad26a6c0e243b1b38171ec7d29b57b7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20e482cfc56569c76b5fdfb6ab22aa6abb12c7d3c6ee8545d562ade71a94fe78 +size 2636592 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4bbe11d8f561a880a42241be7780a706e2f29c9 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:527fed7488d710855dafddee1feec977f174fd586ebdfc6b46f60ef06b465171 +size 3349623 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..affb1711b66be7f196843df82accebe9f378abad --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_3.jsonl @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33174b7d0f56499b7bd6c832c168e9f005d8291f900c8088ad6e5800f8a40666 +size 4039069 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec95ab71f99ed249b9efefa7b91ecdcf20a39b4c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:681b73206893f0647a82c895b3e9d33a023efc8c860ef0e988297f972ba09244 +size 4724463 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..980c2b590049c59d6b56b67bef9874ed01e92d8f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice-Question-First_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a61563a1461b505668957fcfbc91f060a8ea3c89aa5b63e0720d7e61ee94103 +size 5415504 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2f98cbec59ac912507040f9e581c337df9450665 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c0bb7fa1237c8f82b2acacd8df54f3b0d02c03649ac0697fd647e0942bcb071 +size 1870084 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..18c0ecdc1cbdb1e471c5b842b9d09d414171123e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a46058194a5c0eb26625972906cfb4e5034f3bbf2d0b2ee7c67a6d2da869b0cf +size 2545535 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c3cd9e00ef462fe3f7597579fcb7ed374ab55c9a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:229eb2db2f5c84ae72c887134b8eb9833d6597eb8765cad95cc0c0795d576f69 +size 3232567 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e9e60dc5986cd6905490078c47ac7a8c553fe998 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59ec6324da5c0f0f822dd1c2a1d3d4d5d6812ad42839aa12a1c0d2500ec9ad8b +size 3896120 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..7fe0deb52fc504de0e007deaeb2d49ba2a7c61cd --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1444f205f73b50ca736e15a7747787733d7999ccbd370f8cdca360cfbe9d5922 +size 4555398 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..008d75939666fa7ec48ec80253b6a585c7173659 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_sciq_Multiple-Choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3e45a9c5e6c91ad3fdc639f038da7cf1587dfd7dd3ef23ce2bc7497cdb0d83a +size 5220545 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..587205c405d8151396605f8ef804f5d30c87499c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f67b4088af8f2f02c6672d042484e3b57d1990c5c3aefa7ea13efdfbe13c5bc2 +size 2213758 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..082877b1f98a5a750fd41b97c94303297a1cfa71 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a86d06d6dcda41d9a8e21ad3de95ec5f5cbc7dc8bed6e188ecc18e90791dfab3 +size 2974555 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a854851048fbe4a9ef00400bf9b329ddc89c0649 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42da866f7b029300830c6f4b1846035e588ec197e57aca2ad6603cafe568b9ff +size 3726785 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..06268d1b9115877301b6dbae9ddb8cf39273c791 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2225c9d22942d5ab2107ee19ffea19e8d0c5703a0bb27da17dc8ef269b2664c +size 4478465 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..933c616603b6579224ea2741988d944fd7083b2b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:167177b2e22f00dd018a42192d38850286db4ce9997974ecd89b07e15bb382f5 +size 5229802 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d87f8dc2c7d4665ee8088c42e0562b61284ec058 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Answer-Given-options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:928d444767cc207b2664edc3dc76c5a3690d08cc68361656e0560472dece6943 +size 5980500 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ac27ccaebcba30c0325502816cdf28e3c035d454 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b66e37a26b1cd15882b153795fa1652e6281fb9c4c545635b4699e6e0b6f050 +size 2361630 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b850e92fb4a23a2fd9bb10d3adcddb5c0e34884 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f5e1afc6eef8c6fb3f9a65ca1b4786b3282a953326ffcf4beaa5cdfab9ffc0 +size 3193955 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d58d0db7e57b57d444a2ebdf3aa76fd82fc38708 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:578f84fac9b0985ecc97c473073c582c48ae4f71eb61ee2b3dd54bb6cb03fa0c +size 4016568 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f54a878f5a9df507f27f613981bffce4771de1d5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a8010c2ec764f45422894183f5c0ac1f9456db83e825cdd00334dc68f72ee26 +size 4838889 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..11f4ef0038d9c4019b540efa6f559b83168670e1 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe2643fdaee6bb9adff357d2ba89607da03cfc24f6c00f6eab6f036d9d66a9bb +size 5662050 diff --git 
a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ebefc07d79e5e1db21c8cb4d0eda90ab94566507 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9b2ca2f37686ded05492e5e36e880531360f33cb1cf3144cbb5293693187270 +size 6483733 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ac05f6ffcfd2d32f338151155878bb308e35fafd --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:710cd6397f6db7507b9bfe1944749894ac7aa8862f37d966f86a8435f7814663 +size 1879091 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..87dd093075eead8f8d8c3310a138b9e0095c05c5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb0e55f6334d9423de484f6c5b037438d5e7acbd09c055e136226d7a2be250c5 +size 2434896 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a0634b3c7f279d687147c364a711348637df5809 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2abb032a6674b198a93dc90e7810573db095370531b0506a6e3b33f8984fb976 +size 2985235 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4ad72c16dccb2ea0bf940c6aebc7344befcb1be2 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:827c1ba96e062abdc5ed56b63a61b6b44a1aa17b310d59e40dfedcbc85a25b2c +size 3533867 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..54679d048f92cc01b11f24047b4ec12f9b9cfe55 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85db1f4b19dfacf4bc3bfdd4e36ec1575000454ab6f76d8320fa3a08bb57c76e +size 4082209 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..ba25bef7733cb048f4efba3432c488b2527af76b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Generate-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a8ce47a793afabe1770d42a7337e612e7f1dd5ab9f7933cc096de5a271b3366 +size 4629323 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a573f366f2b940df025bc13a0d728db5d5030023 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90c94d132cfd29d4b054ebc27c25f1125e9a3add010d9589b13a03af7c0338f7 +size 2371771 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bfd6e1ff83fe2ab101ec597b58649e68fe35df26 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a447783f8ba36e99e78e4ba064d744ff60c7bfe80f37dcbca236389d5908e6cd +size 3210595 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce1bcde00a6b72b25faa2b9659c977fe6b252dd5 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d63962d3a2c69e452518d723e81dd0f2def6fa92a1d091d9563ef6ff825df85a +size 4041168 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c631b792c734ab803b6f86d2c5652a3ece9f6539 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:444176d092fd5afb21d9f8322d397aa6aff349c78be3a1ae64eea01df6ede951 +size 4871440 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..927e7f1ee24fff4455d7f0a460c45a4f084a14f4 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:543d10ae5d7e322d9ea2c0f7f1be4212d9f2a4798f68073bd11187a01d958eb9 +size 5701558 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c3cf731ae5261958af6cdf7aedf606e74071de29 --- /dev/null +++ 
b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4f3b473e994107ffd027f9277d48e1d7df3eeff4e12e4fe99115580a49d437f +size 6530941 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..64dc3e6ddd342b9c2f722a43fdda474dade81a8d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4bcc17edcd7d514deccf0c38dbb532f183e79d399f5df3eccb45e6234c42cf0 +size 2344657 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..59e0ac5eb8db571a1f83e77d4e06cbbe9753ceca --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e433055c0404bffa13ef85f4f77542bf372ce10904f0f84da57a680509e5f01f +size 3158321 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8594e555d3f9a18e3318296e65a0f27a23b7332 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4105e88df3e4416cef2c5c4f49625cb8bf2a506d6aa1ae9a73016bee4af7c46 +size 3962549 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..981430d33f758168560e022743a9f3adda4c5d33 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2a7124ca358bde74dd16f4de1d940a4cc125071ff60764d72f1cf88bc3359e7 +size 4766053 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bb18ead393538a8d867c7a78ccf10cc988b22f56 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ba3b5de58c0252f996607ef90e29342ba08032a156a01818b8467b54133d544 +size 5570302 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89348f9001ed99a7bbc2da166a20d5176ff450ea --- 
/dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d258134620863febe8916022e57de9a77ebefcb9380385259f8e9595cec34e74 +size 6373228 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1a3b0e1f5653c04031fde5c4bf0bd8635786723 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21611cff03ec5cc1a7b86bc837f9f9994149433bfbaed3a0ff09a2565ef4c852 +size 250576 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c18991c637240d7c1f6e9b33fdceb375ed5dba5f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30984712fb5bc981951066ccf710bf424df35a68e6a279aa4b179f811069caf0 +size 351347 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f17c3392a522c50aea13648817bbffa90dd787e7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55ac820608f2eb3604436ca2f03cf3d49e1ae86c3c79bba82a72bd1791f51465 +size 449681 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23c85e45e76fbba1998c3274521fe01ecca9bdde --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6920cf251b161edfeec100b461e68197312d61d2ba0aa55513abfc63fbc44ea6 +size 553057 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b852d6f2925a5037b4af1a27891d6af141fdd3d3 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:835373b7d454c4450cfff46b8559feb8eab032553abbff6da56da01b0b530725 +size 651690 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e1ca54abd1714f01de9147f5730e316c3fe53a1b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31eac34c59bae5f100088b8b36f26de5655a281cc4b5d5c64530fdfd5a4685be +size 747602 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_0.jsonl 
b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d511aa36201e093bcce29e800c5b1f770caa7554 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:381dfb7992e61072753f7568b0357a7d431f5f9c4e8461298672dd7f56f32a44 +size 293154 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77c0f8d480bf954fb578d1043459d40a43a0c9a9 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0a06f820278d561016b192bc8075a3b7314a10668b22209a99910f30591f346 +size 415394 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..52660de3d9cbf4dfc3f3fceac319a7fdf1e0bfff --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c8760dec5822c4385d909586baab6bdbb4fb8b9bd7fc6844aac9158a5f1d8cb +size 535369 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d920d2bb8400e168ccd8e3afd0e3c29316a4348 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:633e6538ef7e5a7ef605d07d7f64a3d14de5e282fc89a703e8bd5e0267104d5e +size 660354 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b21e1019c8ca0065626f7405b5b18c6b143c07c --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cc7b8c0a9ccdfb3495e36af000e831c82f32961af097de7b868014cc66e0adc +size 780573 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8c88716207ae456abc4c7081ffed65aed91287a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:885b90b129f88c7bdd92478470c34272bdf3b927ea680b8c05261fb4c82351a6 +size 898081 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6509212e45c21f69dc7414e0bf3cda8b8c1fc16f --- /dev/null +++ 
b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc60771e2afab8c4c4978400db2bd4c365b2dc6207332e8fdc9041e5b3e148c1 +size 258521 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0af9dad7842af4fa119c8038ce1950ff6dc82d6f --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b76fe04550168be6e15aa626da5662d75db1def9e3817cd07b532c52f8f85e95 +size 363044 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8fb5ba11770c98a62a73d93938d5f4b505c2cb5d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ea986bdd089010b1708d1ed01687d3aab658ef5b9fb49188c2d446f62435fa0 +size 465284 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8cc92e4c4ba878464621a521f6df0e1fa3f911d6 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5e55146ee47be47237d1930f0c5eb1dd67cbada32083f3d27455f93589c6eb2 +size 572551 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b6f40bd10f0aebebe55a12d454c3811df39b0125 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a1456220d5f792f330d114249e056b5ad283200aeeecd9ddde4222c778360d6 +size 675046 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71472832717c3e49ed5d18dbfc449165248b5119 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_does-it-follow-that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:477af9422b1ca231808caf236d814dbd9d62feb987f49a3b583df79923af5651 +size 774831 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..276d2bb14ad81cbbeff649dd30c216632ab448d9 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ec41b5c51961c73a2143e58f2f2807f5a7d2fdbf8a7a8152c6c94beac9bb08c +size 261293 diff --git 
a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2689d3318836b5dcbbc30cb37e82f9d5586fba7b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61227a1214183ae12588455ab074a34a19e1f8ad7f561d8cd1915462b3ca4d5d +size 367750 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1c02cb3e00996eabc833807c7cf47074327545a3 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e56e7e34454ad6152e5e7d4a85faf8958e1686cfafe6f8e34581e92344d62bf0 +size 471934 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7977ce0ac9421f9bb9ff879520b510b730668d98 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c755797d2d3e328a29fed551c6af0657c04d95ccf370e8f17a6cd967727a3f45 +size 581131 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ff0b52345cf1ac7bd3556e7f524b2df6f16cce95 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e829a466c347a2895aceb32fc555a05f2fc4ce4ff1427224d6e235b48439bfd4 +size 685561 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0b7fc501a6835d4ccdff21cca511e10b5ee71e94 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_guaranteed-true_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28421fbaaf9933652029449159eb87f934f71b33ea8276dc57230e3c977829d0 +size 787285 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6c56b25746e330d89b622647775309d3b4c91396 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1adb3e9ccc828d67ebba819618ae04bc5e7536ed87e8be6d9d355288849c552 +size 262404 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..351a660da50ad873c9b3e0569cc06ad15494b66f --- /dev/null +++ 
b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d98661700638905f465e2a68a139d716fec832e8af793801dc2503f2643128f +size 369689 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..86901c7e8d17e657f481d5f9f37a7eab48a4fa88 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fae73e8a62a0b5cba04e700b5c0ddbfb54ced245ee2d5f721e04d8ce1f55772 +size 474706 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9e407884ade672069a4a6b2ba37f19a4e1532d58 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36a9f3495ae400c8199fd2f44eddc379c464b1fe96b66190651bc6db2b0257c0 +size 584747 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2e663b756a01596bc34584897ebc9d0855a7eb46 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e121e9581c063a7ed7df2a408800b360145356eb130426734f2dcfe2860d68a +size 690005 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..840b1f22103f9f365d4545124d432f23ccb27b2d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_superglue_rte_should-assume_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17607f30d7379859392ed4247ff6c86ccc1458befd410caecff53faf439aa183 +size 792563 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da2c0fe72a2099efad7e8842f7c8f000fc754600 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1117077ef7ca6c82e1cc60e44545e595d52b4bc0714373dcdceae10aa638d503 +size 1039172 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..de3897cb73ddb9bfff16441aa691ee3a133c5d49 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b39abfd3c1fcb7d579551898bb4cab36f34024887b50ace5c7717a0420726f7d +size 1300104 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_2.jsonl new 
file mode 100644 index 0000000000000000000000000000000000000000..9626169a33ae234ee22564dc4828a4c8dea10589 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f12b1db39041bec341ae49091852316088e99f88ce2d8e61f68423469ba47cb0 +size 1561285 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c1b541a2b524e7149e8719ab41adc7396557d185 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb7162d822ae0b354a6a71af001b372e8531063f9f1fedfa6d9acf5dfe4d2c21 +size 1822773 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c605486ae6d518c6769f230ba9d96d7ccc83c672 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cef071c22f54c51a4443aa5b9b7b27e526a7fb30b4cc054dab9c7ce7dae2ed35 +size 2083075 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5621c2157c223e135f1d50d044afe6961e092ca --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_Replace_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15a11816bbfbb5de7681cb0f05823297e132054c2c10250f2498add58601b1a3 +size 2343777 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e245f293005aad140bb18e66c2ed064098d75a7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b38a93b006d8627048357bf8f028bcccae590a0c17ec1e69108b542f48880ed6 +size 948111 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0649e1710cb45b987f718ba2bb5088be3f0b9a8 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e75fb8d0b696847fdebcb6751c231101c3bc1b999ff826c1b5d0ec8eab3226fa +size 1180438 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb0b21be84e0f09acead6a8065080e26542089fe --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8d7c70f252a45a899386f9b7e3fbf46346d885c6c6564749c76f259965269bc +size 1413052 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_3.jsonl 
b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec30730bcc80310c3ce803add80dcd15349b1c0e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df52e1b214be345c8d47d6c74202f2bee6bcc3af4de6a374ec97576803e38f62 +size 1645949 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8820e48885e81822c8396c56e3adeda030149cdf --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42c5c69323d337312a2f0e94a2697e493f91e4297e8b178663b30990a85e3664 +size 1877836 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02e40dee9ff66977b5b625427860b3ab7f3afa73 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_True-or-False_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:548d7e7ed82f8b0f7449f771083fad52885e044f6fb78c4c2e3328149184bc1b +size 2109934 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da173e2b40e16b4e4b9801589ed7d08fa3859fb8 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f4dc36dfe1e096e9165306bb9d1a840850b8c279b375cd2978e2956fc32746e +size 1009901 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eceb47887c8935d5a534e42e02f78a22f0d965e7 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2acba451bc11c8e6ed2b1048114824c62e7c0d6f22c42f625e0c50fd2d878390 +size 1243153 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..faa11a4082028acb62b6bc51aaff0ba72e8ef33d --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d040561c6ec06398671bd33d63694766f914d6b072938268d0ff632add7ac357 +size 1476478 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a1429ce05743ca60bf715b640ef42ac733ac388a --- /dev/null +++ 
b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a86419db8d78c7dd6e4d6b75180d9181e68f5e399d12fe22cd90c0a45cb7f246 +size 1710088 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0b7d39ceb5c238cf6211f8d0488f6e29f855101e --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790a3a48cc9c39d890be3181df4c8d2c885c19dfa0209bc246b3b3b4841114ab +size 1942449 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..029b8f922962123897e0a78a1a14b26b9903de26 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_does-underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e31c8654b20f9b1637dc30b3b4b37cbb4b06aee581d8ef1cb7f55dfeb8cd54ae +size 2175338 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae741bf4adfaa55d0227b9311b2c64ea40793fd4 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dda1dd6c48dee008502b73c69ada42d514d7e272c919f81d2ec47267f3e539ac +size 969448 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d426fb44e80acb65676e6459c96bd1faec876104 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fce93abf9c854305f088fecb3eac837284b9f9d9fabd1db01ce5b9b97a21ce86 +size 1205095 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d9c81749e198f7eda276d4204fc40ddb30bf1d5b --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98583723f994c4945d38a649ff4a79d7b82f374326bd7138e148f1c904d3f5d3 +size 1440990 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7a321fd1466cea5a0f73700d4ba662662a6d87ba --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9c6eccb650cd182fe4214770d7bd5b824dd863f0814823c354937c38f7345d0 +size 1677070 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_4.jsonl 
b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dea3d4d501ba56d8c55e76167f5aa25c28538fc2 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:413b8e976c9e1da640b7804792596574b3ae87783ac9d4c963f7f64c96023d88 +size 1912013 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d5d764e8b1a6eece402e77b0278849146462e34 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_stand-for_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b04e4f77f60dadae9a5db58659c13be67ea9b037b76a8c3b3cba14cd9f2a7ce6 +size 2147435 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_0.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8fdbc9dd6a965c568f687d259aef1dc9fe05de73 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34ef857a9bc70bc4ec5012096e2c2c72c3bc9fc782e087bd35508df86710c007 +size 1016237 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_1.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..37adedde6233cd11d53bcee9666610ec9345c8ac --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6af44240b4fc5b2a7736eb6fd3bffa02e56dadb134e8364c0548f40d17383e2 +size 1257131 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_2.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61131efc22079eebee688b94c1b34b252e5b7832 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5148292f2704482effea1559f70656e2acd947ffdfa9fec76ad719669b9d891 +size 1498026 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_3.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45bbeb20c39bfea39b249150d4e0e2828e7cc60a --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4289c26785f98bc1c9b8cd95a32229837cf078159096541cf40a04d000bacb4b +size 1739214 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_4.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..28397a4f3592fc2050d3f929ee52973adced8caa --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:88ba9d102a3eb303436de0f7fb1b471c9116177082a235dac6e7b4148655953b +size 1979256 diff --git a/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_5.jsonl b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..81be7364bcb5a034e4e7613a6f370ec1f6f8d7e3 --- /dev/null +++ b/4b284b28boscar/eval/examples.4b284b28boscar_winogrande_underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29e1738bc8d7d7578c1d5bfdfcef2652792f8fd9eaf9ce73424c67b9633db590 +size 2219653 diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d43ceb48c22837c2e284dc88e8dec4e95f34ce01 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.3236606671689831, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03434728960928616 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07548519902353701, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027103393867684133 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.27434436630785664, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005190735227370315 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.09843971445320344, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020285748309311217 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03280247893554901, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015645885841163746 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1283696309182241, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0032411037301123703 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.044864078248862495, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012327331599678007 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07235248106657086, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002585891559293976 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.26594645785931426, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.005067281777094968 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.09452977794835304, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019058829105986163 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 
0.07204958395594553, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00260168307386312 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2598611687840442, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004868835623139477 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.09372697520153284, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019257190223106488 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..57f7873969d5bbcecc7bf347f8c162f6a08382b7 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5303952344866372, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03866628930999967 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.13500481273386208, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004456041975528417 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3050209335340558, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005019779051030469 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.15526776951779941, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0037112903723659145 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.06800836209907518, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0029183003453498208 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15616590508602413, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0035788033957169676 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.07803113267822599, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0024911371039099254 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.12100254673151754, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0038758351195453834 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 
0.28545106347976323, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004617233759164798 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1409133691299396, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003170026813140542 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.12390505105203532, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003973176754882277 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.28858761694209856, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004669345842896859 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.14369743370526267, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003262924473269048 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fd9f16be65d3d79712118f38fd20ea9640cce8f3 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6963169812480917, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.025883064453583102 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.1792256974533611, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005365719346439527 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.34488184545131717, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004970214155406786 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.19513258665465127, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0043495841141288575 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.0957315113405197, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0035283155722927633 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1834475236414602, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037698051706010482 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 
0.10282617536971313, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002972081492632468 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.15875527177897067, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004689203205430283 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31884399599505703, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004581869680066843 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1746470290433306, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003733649528603583 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.16348748083622414, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004839999383468051 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3238241924312652, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0046243997645253 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.17894683258352004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003839954827867676 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bb582b2957bef96e25a0412544a6a0b9933673c9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.8274667802488902, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.042366964185806814 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.18955004198888442, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005662094291901046 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3533218112336572, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004960841139206212 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.20403726404125458, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004566219372488845 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 
0.10477428578899617, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003935236903263568 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1901585058979038, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0038815093874070626 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.11040571352169745, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0032753908389439508 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.16706159999567868, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004913635515279735 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.32629206613585876, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004527254280455867 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.18223468887125507, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003924418368935964 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.17216966052247287, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005091094356122184 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3313273065171706, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045736991609824015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.18670844563664446, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0040383704320243125 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..45545788005ff8a2bff860a33886eb87d334a5ae --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.9591911553106375, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05993639067207907 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.1924412284693008, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005450604205447632 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 
0.3705586639190707, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005109363173845363 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.2120064135279467, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004550653208173271 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.10648010653830348, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00377122887055479 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.2022148169333304, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003998276489277708 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.11494130887956767, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0032162324276510928 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.16950864458355333, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004740831041491813 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.34083637585044774, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004631727723114902 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.18886918285090654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038962288174863224 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.17511108437715225, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004905333361798361 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.34672946562667306, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00468590709544727 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1940229701192449, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00401611332459292 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2c9857e6073bb2f0fd5ceb447e82566aa41e8748 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 
1.0660500946745477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.057054038562455854 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.2143907231679788, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00587683477313044 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.37935769424464766, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005114542762979336 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.22541411583738258, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0046964527830320015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.11999776807861791, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0040878065570314984 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.20846703611506925, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004052332137679142 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.12304296607701624, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0033297218439898735 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.1874233346433946, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0050850214855498194 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3480461100730932, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004676777395335985 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.19952305726552913, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004002247434870572 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.19460128268928695, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005298127155329163 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3552748425353144, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004735986385918885 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.2058454997784591, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004150385211942694 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4882da8f1bf0eacb16aae4a77c7f7daad0203a0f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.0346205955689389, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0005387702896022102 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.2609393105105274, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002297796887792241 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.05919431140554899, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008128532722581225 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.0016947079558272144, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00013522786800163755 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.015091208295292367, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010023028590081659 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.002924238155455059, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002204715259212068 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.03444508009180932, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0005126959545386834 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.2604556924140456, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002268199224560253 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.05894324375481446, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007794281990642839 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.02388328793589735, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00037394497845618594 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.19000592468238123, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0017654293062185674 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.04104954287102164, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0005608202683646394 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 0.0446593937662798, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.01212322535936079 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cb19cf17f77508341bc9d6f010d0029c5717e603 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.5057362328292612, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006433371408967441 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.3989876838504679, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004870545370765362 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.3936404222340052, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004489407812168967 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.26705886387883326, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005229555960716035 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.20484037138370584, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004083553885711203 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.20245301477178956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0038618279740674083 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.42025595395161747, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0057680756258974 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.3333446990971047, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004337074043374792 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.3258665538462602, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003965783684207691 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"explicit-graph-description2", + "rougeLsum_precision": 0.4457444624630892, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005979013232422238 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.350619501110215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004439950958674579 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.3445604185348635, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004072911701143514 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 7.705805240318873, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.39271445746759187 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ba38fed14192e83e5778de21ebe9e4feafbe37b2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.630988373148268, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005562228139019159 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.4799246477199157, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004778694197234316 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.500240361458831, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004172644527560273 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.37181396597783567, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005112789501488624 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.27856393096329507, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004222748696771387 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.28897489966738155, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0039448683747204324 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + 
"rougeL_precision": 0.5255685123033286, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0052326684310782185 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.40098528853058324, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004349425470384873 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.41542346879767855, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038486111843277224 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5562724380069954, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005355340858219827 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4198502948499723, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004340065004215543 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4374377646795759, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038445750973011483 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 13.13450054463073, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.24192164477015318 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a6e0f3fec36e894c2d290b44b2e6c8c9218d08f6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6493064131306431, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005206458394255086 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.4892615986438097, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00478082669472797 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.517895420103728, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00396951690057282 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.3862528888679162, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005098489988602744 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.28597309969092344, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0043072728014668035 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.30171853631179346, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0039958865212606455 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.543624778660909, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005057179222803841 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.40842201733115513, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004421919280806686 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.4309807601003817, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037811422688738315 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5754756758954318, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005127760313090274 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.429002529676783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004411864254996235 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4548491812792949, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037688985484623095 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 14.068982889672569, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.2968197877026963 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5a9986e3bca65ea5ce829828b17468a8a0aa29dd --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6543123312997964, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00526064036341449 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.49725103123595865, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004700396819130318 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5246301036110124, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003984503778317276 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.39287572396976433, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005092635515157949 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.29395189402830646, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004329936715009244 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.3084715156757791, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003984840568000891 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5442321756917496, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005097259696532552 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.41399018562464335, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004362821518175387 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.43472057623746324, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003783613140791061 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.576260208132338, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005175720678552694 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.43462623702782055, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004330850511626005 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.45859263436626035, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037604784900039713 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 14.5257757902083, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.27934070667678 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7df3abecd5c7fd6f191907483ffe74ea609329e4 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6552722187848778, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005121690444471356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5014318839163389, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004841314732312051 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.528998961079145, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0040111986437856685 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.3954692599951466, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005101898040784152 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.2975293972498264, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004380627560293482 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.312329889253345, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004020073838458014 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5462714187744488, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004929822164408306 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4197087461962633, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004508347488373704 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.44005743632484523, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037961990367279827 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5782315118963006, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004987944243191088 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4410697720893144, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004507508365954698 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4641416559814427, + "dataset_path": "GEM/web_nlg", + "dataset_name": 
"en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037529394806257313 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 14.647692432598681, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.33108061636277397 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f7ca8cdea2b47aaf660ddf5ddc22417099c8b58f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.4175225760583597, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0396891909821717 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.036154702825363164, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009818080564093425 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.25194575603722763, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004063756391959949 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.05913240499474725, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0013929493617710806 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.008622739259285814, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00043823119869464177 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.055070843900621946, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0024967396779178577 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.014032525290363921, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006860882714756988 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.03319248736743588, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008139270777232365 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.24076860761446003, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0037075484547642234 + }, + 
{ + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.054720707618427684, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011645928856203625 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.028597579926765544, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008872776325204384 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.19465530426339228, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0036663974513027695 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.04627650881911135, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001242868453881384 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8ece680e7601f47de77b21c8823c331816981cfe --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 9.081272600699627, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.402018320394695 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.5220382053968772, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005987738267175794 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4191646724726185, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0048305024526799824 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.41551598976024473, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004296023954233353 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.2775801888317173, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004986567944186219 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.21926871203257606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003987182862859094 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"implicit-graph-description", + "rouge2_fmeasure": 0.21578393575171176, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00370107800929059 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.43396513758122374, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005410511105027679 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.34987365551609134, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0043112716180312175 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.34407708842598816, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003816657058260445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.4610543113184002, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005584357166355635 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.36912334523668233, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004405007112813828 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.364590803961402, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038882985510075604 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..23d5bb2b3b2b1548035b04e23024df10c1096fe8 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 12.827723467368122, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.2395104136983248 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6429253445241435, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0052682241566706514 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.47000703561019436, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00471475669926654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 
0.5027035099281697, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003999041411432061 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.37759250826943214, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005149133097785554 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.2707511593118337, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004165481663439759 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.2884428300738137, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003938627847598879 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5371248445521856, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005058812589328822 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.3926829496172771, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004369603703165467 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.41815460445037017, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037834517040888243 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5661267893996395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005155408397958708 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4111001137384524, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004388595491116961 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.43908602759039184, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003769542857471681 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bb6c1129e7b265d7f78c708b4c979d6b3914efe1 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 13.74983515740776, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.3369336643179774 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6663828985557976, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004969343558059818 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.48515678680621765, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004772500474063418 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.5239717326093746, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003941772216734925 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.39983932213666673, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004999905968285044 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.28561981827808197, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004259780327670949 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.3080090265553984, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0039734279794513645 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5573959843203435, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004875599891523969 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4039560399628867, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004427731556101224 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.43567806892379524, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037887038508846284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5892685091559403, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004925427939501413 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4242330575924514, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004379788059459432 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.4592396211258998, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037316704654461576 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ba31252f1d953dc6bd0461ced70575d90c2aa898 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 14.110843003647567, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.2511645721431234 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6682856998360935, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004979185134800539 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4905544087754697, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00468108576365346 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.5296487843403124, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003877700877449371 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.4021464811436638, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005072504377728981 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.29219879155961126, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004343402687570785 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.31329482376051826, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004044426728834422 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5565192816989143, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004900320383045677 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.40974574613193676, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004412607746565604 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.4403844333395566, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003798990173365922 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5890267104817443, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004933639417706148 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.42919808145201893, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": 
null, + "rougeLsum_recall_stderr": 0.004343745851162966 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.46355424353344893, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003709534998103682 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..60f25ab71f82d7f302912b29cb9e82d3771ff5e3 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 14.865246851977174, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.2469951733822034 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.665414040691734, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005062732785653125 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4948589439641179, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0048064787252879194 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.5305017223370667, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003988654528152028 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.4046994581842311, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005111935694290756 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.29461815114988565, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004314404789364512 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.31528241951341845, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004000144045412803 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5571982616459412, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004943046835934418 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4138241225793445, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004468742674023455 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.4421868232457002, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003818602071050544 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5893685655133948, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004992290666708423 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.435144330111697, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00447532683122944 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.46632191073623974, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037788576614651527 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f79d316891c182e97b5dad1a3a14969f8e4085ca --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.041325640272890894, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002560445046946543 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.19041477540959695, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004524197400485002 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.04935966890223983, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0013619520000937328 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.007548030574657656, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007170032082943574 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.04879535050018004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0025350606449171076 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.011630773862436284, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007179556558495777 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "non-explicit-description", + "rougeL_precision": 0.04009329623073503, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0025173115168095183 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.1861979765601121, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004396398136666792 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.04776011784545986, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012826838220057924 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.03614944318759725, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002508457252552527 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.15867343894687855, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0038860218085092343 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.04098044832889336, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001174299018944812 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 0.18086054928747558, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.01999596863932189 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..18daea363b8f12598955b540247d3f1f86c700ea --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.48796554161260514, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006037067740666733 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.42982516412058086, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005042143982997805 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.40276036162942674, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00447207097628362 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 
0.24998961495361918, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004829698568164138 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.21846457768780272, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004069803208873997 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.20232455371188282, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0037130871793239284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.4013731521366426, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005316473393809085 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.35652193084587386, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004474375971994101 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.33023329964113135, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003891418836388668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.42696683460990426, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005554491153938951 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.37572008067001184, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004595101885912417 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.35033460066099453, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004035051365542008 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 6.34872820947942, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.33668537119149733 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..de338df76f8227970e21ab1d47d4c5349d0f4dc3 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6220966532720777, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rouge1_precision_stderr": 0.00556098058005664 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.491700916790073, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004899266870189194 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.49922840734492113, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00421342033950754 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.35157330398460046, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005016170583236788 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2757523128127938, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004283617860672476 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.2773940114781611, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0039115931012414 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5072542016868399, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005141659470235974 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.40406132924633337, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004458244442063251 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.40603118525029447, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038085025798868743 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5397172744667709, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005262171032735078 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.42514118811350743, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0044675186943622935 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.4300464517574411, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038309247796264817 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 11.432615162808808, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.6195135716025121 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + 
"seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..24c48919fbf0b6558d02889e8136111a9529d511 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6424519756213152, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005185084062837668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.4975396288082743, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00486795443028984 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.5182163799306218, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004028853507439458 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.37021817845747296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00491121715575853 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2855540858782935, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004340737622712597 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.29522310783911515, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00398082279125943 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5259444546489896, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00491487662373786 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.40982402267867996, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004483163518003983 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.4239367573766619, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00379828921652199 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5588298068318703, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005000500827487751 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.43118988673283126, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0044827837991875796 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.44843024559472394, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0037854444143360735 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 13.427423242082552, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.21378430345614655 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..685b2f5a5a0b9d09a45d45a14d2fed235edf612f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6536854686117716, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005055038024884472 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.49481801273909026, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004774469373884207 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.5229740272347854, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0038958904995235783 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.37832323284536284, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004895191350276094 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.28403935935102725, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004268699115858015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.2971667864152805, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003870964512172995 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5331243023450667, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004788026817581983 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4068796874397253, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004415817647854088 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.4266743694870892, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0036930292492481412 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 
0.5679878693884904, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004880518673065624 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.4294770442196603, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004389909285885383 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.4524753167873619, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0036313591386608456 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 13.213076094213868, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.1911365124809486 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8b982c69295df96ea6172e57eaa11da075f9ef9c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6562302674293572, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004978213958757384 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.4993162400355719, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004928232006071223 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.5268008342959368, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003941366188303182 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.38281434184302715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004993242918072653 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2884911653993333, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004386988321702633 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.30184258420473004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003973705493904007 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5369344693567049, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": 
null, + "rougeL_precision_stderr": 0.004792429508511601 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4092001227605644, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004488279114524377 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.4297537206934182, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003720711472388013 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5723322848404121, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00485312902623496 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.43323580321413074, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004506713763132942 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.45631464166756985, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0036619891881959094 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 13.591065355686462, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.33251846754741937 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..33fb7c23c422b07349c20f76743077bb0bd72c69 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.06529894179611363, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016701036636380111 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.28611265999014596, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00466329131578748 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.09397440376869791, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001967935502764586 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.014752341752067778, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007834696465856099 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.07635223264261566, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0029933179291138446 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.02250038672501546, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001081320815406387 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.055742312174539926, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013139324791962169 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.25613180684246944, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004131606370455397 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.08117813588062889, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015689150539593354 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.056738696823048364, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001510447808113087 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.25166816812712434, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004300637878529783 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.08140414401609677, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017706010471645551 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 0.45356160811555557, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06126183633549046 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..97409653ba498d38b474a41837915859ec9f29d3 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.4751602143647354, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00556979352231675 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"very-explicit-description", + "rouge1_recall": 0.4529875909582768, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004950307941168847 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.4149721611957904, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004307580646927434 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.23961216679613442, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004369953215662577 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.22983670660533184, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004036151644367376 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.20685824798380176, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0035445318812293513 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.38776635650699914, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004900676753268663 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.37234842080956987, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004351586149131922 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.3372488007777988, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003710311545338347 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.4136295834528666, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0050805763062877 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.3939649739250141, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00446027620490063 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.3592040570850069, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038367843690821552 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 5.946501202386316, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.24351717513713797 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..11682ac7f02aeff99f4c3686008fc7d5cd8a838f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.5846333211355946, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005740649275229668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.5013225392819405, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004823682144003579 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.4874115938302624, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004296628165610245 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.3264068039295695, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004964606354905216 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.27678176045081004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00418664596670321 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.2674108062793395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0038746688112602326 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.47701281240617616, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005208435147495593 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4135138973656255, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004400914707960098 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.39705535267405395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038555106605338767 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5089420537199473, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005378042598242251 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.43476269636565773, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004386158883269227 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.42141948798275997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003907811087276578 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"very-explicit-description", + "bleu": 9.556195074080163, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.6010773991832432 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..20ca95bf7166ca08d5ce4697dd91f21d65931d1c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.6211961468572741, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005451043928857081 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.50338330282331, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004845151895746252 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.5118474084220225, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004211249350411945 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.35764468264912685, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005000680935796298 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.2872787024309365, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004370883087249544 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.29116539392687324, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004106307235430533 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.5069648439596643, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005096748917589604 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.41279400233423746, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004428779743643375 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.41711953001787855, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003920437530291511 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5395270078765012, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005190497474117719 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.43551143891048294, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00444172146138816 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.44211630816776704, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003913688068087822 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 12.602247258212916, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.5603144875513827 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..927be800bb142e2c0b070691db80b339700ca1cf --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.6366194970225395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005117591858573804 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.5064153790661966, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0047764966262347315 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.5226187226330231, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003919125144592862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.3676900136060854, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004895179178207714 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.291575080362687, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004386520481516148 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.2978772889798726, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003939061542109874 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.5197169335498214, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeL_precision_stderr": 0.004853089161070492 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4156312087117916, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004450919756390418 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.4260200185326371, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003725170111967127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5536011748870678, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0049218194275952665 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4398531089513908, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004458785918722786 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.4524894725742852, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0036906432733459713 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 13.861907897053662, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.21106760860433635 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4aba76ffd82c50da08c607842e1959672238f39c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.6489899322749847, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005165184579861035 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.5065958613699326, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004851662674645652 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.5275749184040505, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004065463426159502 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.3794125655803978, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005037370423173522 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.2915521692478834, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0043068877311057425 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.301972918522207, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003960139499843082 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.5300511231875715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004912600312705692 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4144029433426168, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0043879949295119345 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.4292023909929102, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037453014462275973 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5647334651592919, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004986228066141274 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4389205713401686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0044359561894050025 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.45588057276344957, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003722166270754704 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 13.715865491939756, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.27544078012455814 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..463b3784d3f43839bb8b0cd12d0946ab85cec1c6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.14151848718302718, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022335651457076155 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + 
"rouge1_recall": 0.24752522343029543, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0034583124706848373 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.1670780364841084, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002317030768961082 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.031513222807695855, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008276838755083649 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.057870603612401005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015707399503954848 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.037518035745726125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009219458435218357 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.09919806351581549, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014930174402303853 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.1804489451534078, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026185168421145825 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.11838776198838015, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015710210393614862 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.13052622432432498, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002064356881448907 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2290141140670916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003219882829713742 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.1541763589711041, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021393511225024746 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.100034714712357, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0935734566557911 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2130923fc8cb7ef6d1471af0942122aa9b6d289f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.21496359349054053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002755256183241888 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.2977766333640761, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002926238063431334 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.21945211051932914, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020025608489888857 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.05319078037486938, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0013739477379752152 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.07248231514348752, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016065512771453777 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.05267796473203011, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010630123803576901 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.15474418633419101, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0021100703969687915 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.21701386393503583, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002266202639453047 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.15695037484665128, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013801818301413765 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.19867342000677637, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002582375862443155 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2759880519793925, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002754502157352143 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.20282457983589916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018679736258282954 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "article_summary_en", + "bleu": 2.663367618392156, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06404842058801877 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..489c85e676eb847c9e261999e7053b7c39a4c109 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.25214239134804445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003137662714570643 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.2925282850559076, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029278623218607024 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.2320888922471032, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020359189995027894 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.06893560361669435, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001724674687618289 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.07621541050409261, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016383981894790049 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.06025475825878314, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011801337536938873 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.186139567553756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0025221021759129925 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.21455509093210598, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002256479118960565 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.1689405481199798, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015013829962533184 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.23421093467716025, 
+ "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0029753113494921965 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2712212728272204, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002749323743170995 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.21501937359842724, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001910276437232937 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 3.204510495424181, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07787891887551489 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1002cad3be210648f08070c62cc556d56674f9ee --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.22851215436123426, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0036662365296823663 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.2318184716721262, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033564149292511568 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.1924533187886367, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00245294080699731 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.06394913109289467, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0020246406144319314 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.060403040894099265, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016033403202768616 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.050085198320322696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012243114894951602 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.1730054498692057, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rougeL_precision_stderr": 0.002985153102701943 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.17225807912085475, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025651097238941806 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.1426385624538846, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018324750437799981 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.21351220553283318, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0034848884034780077 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.21522664273311504, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003127002658550065 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.1790464884859616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023023834259245433 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 3.083887671263152, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07310310351916077 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..64d66f6beabf7bb4a323e10e0caa7fc1bcf43e8a --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.07835172075365858, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0030934416870520387 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.07356972793882564, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027423636784068436 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.06162695012756539, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021686829458294928 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.022811004840934415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 
0.0015155046141358778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.019466079963312354, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001118722193367272 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.016346805633122616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008821546979197065 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.060220939468153434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00254445620539224 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.05425129446487449, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002038470215681771 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.045627479104232455, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001618799146251457 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.0732763918547995, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002947411739298198 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.06755598716044736, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025110067465870718 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.05685236668461361, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020019069945384667 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 0.1862301901852318, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.019321081401488156 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fd2f6181993d6fe18a3f0e3cd7dd4f2a1ffa2aaf --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.013258396168294844, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014673465961484976 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.011123987415724398, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0011531151327320928 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.009675821577797613, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009637023261395425 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.0040653951508634375, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007388608602909922 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.0032335182862330307, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005205157707758816 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.0027133544163391752, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00038619181882107434 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.010646328299525897, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012705323854153526 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.008682769460160939, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009333010065493198 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.007456643010130254, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007593140109279754 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.012564310140686598, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001415778942442465 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.010438016150849468, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0010937333492898294 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.009044275613373771, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009050612725574305 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 6.622577066013761e-12, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 8.422075320916485e-11 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline 
at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..397d10377b79f908d2c0f9dd063a00ef6410bdf6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.06164473535235492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001405426454029671 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.09131698896316866, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0019207087843768908 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.06747377008609552, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0013675427733203485 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.006790182377862598, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00040039637533508676 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.010655783228026442, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006561371102920868 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.0075404220501442305, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00042912197634379046 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.054017605881063746, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011708433559756229 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.0806353288622824, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016043982386113617 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.05916486574473254, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011048168711011383 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.057274185187926, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001306487045995208 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.08527500630924882, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001791516197068332 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.06273421540506535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0012595672988991648 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.43426362863975315, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.029904347346119995 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b4c51ba1866df0ef430bafd847791a65444a4bfc --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.13567067805697516, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00244783829708697 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.13624606163578262, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0022994798156016536 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.11604016621744329, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016757575785151973 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.018250954077339773, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011620613888053647 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.016843917270409188, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008988934545392962 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.01382140392181167, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006697623363819288 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.10820701273702638, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0020324978288905455 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.10754510337495378, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001775180479340234 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.09122973644360803, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012546408754974406 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.12961910351952088, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002334378178824185 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.12999728653531273, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00216120080161998 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.11080814949606399, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001578435475966608 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 1.053879603834425, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06162451398173484 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..df86e61ee60915bb6bf59a98a4e1a719d5f80253 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.24296232579509847, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0036738254033485385 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.2154735961186183, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002951643463846921 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.1899984072177498, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022467111119683423 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.06583133042088486, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002122690639857749 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.053729671295266325, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014610122704433743 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.047456671275884474, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001201924712858666 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.19235125113808507, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0030450441817123828 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.1689714728400649, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rougeL_recall_stderr": 0.0023296670488581668 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.1482075417640661, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017139326615536051 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.22948210547962722, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0035338194011097584 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.2023936742681393, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002773600109323121 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.17874509637448255, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021236174797339756 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 3.0768418397866837, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08871699114579137 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b0be9ba6e29122df89599883d4ed744a7e6698b7 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.2367568945386193, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004029356352539418 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.1872797276779158, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0031039333230205887 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.1730409097841582, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002502695385341917 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.0682055839178317, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0022669198450948843 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.049755521305723714, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001464354667787732 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + 
"rouge2_fmeasure": 0.046145260990371584, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001243064106408716 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.19013952640409754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003394147032570451 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.14748074449453355, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002454574356183073 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.13627482491549403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019604976905496277 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.22321388782734303, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0038594191530477748 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.17567553909004804, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0029251708582556098 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.16240071165527958, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023599640917498998 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.5833089949318944, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0845794278500129 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7f9ae57a10e47f0825d8b6d5e7e9f3fd07358983 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.0826705327963426, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003344790852751325 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.0598723767707158, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002421888232149976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.05622854248419195, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 
0.002109056978999073 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.026818667629535033, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018242436179135937 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.017564451176379225, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001098422296088496 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.016425733148301386, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009348754957675916 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.06803138505166514, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002872500027172234 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.047513674740425234, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019289443522760462 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.04485111200457293, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016806426649935266 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.07839317558374391, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00321469710365829 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.05605017651829644, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002279210434399837 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.05278414549118305, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019875613494061855 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.08092191325240405, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.011429683298761291 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8ff18bc0db8edd007c9ac3898045ea32790ec516 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.014632085154467636, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016062592024331424 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.008280526295669806, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0009222495294936597 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.008375721168007073, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008677802247575033 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.0048406013014464944, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008704683931109317 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.00222638820794996, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003520578406679729 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.0023532242956938723, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00037781487436313303 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.012386669107254842, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001411967038983594 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.006765656854714533, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0007480585879119922 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.00684440636036312, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007088169655746148 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.014102741266491818, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001565517437796769 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.007844713254559151, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0008781423555840759 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.007947675842187496, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008241812272510267 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 3.3774607869837585e-17, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 4.741357062272074e-16 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a0798ef78abcbc80b97964ad22a92ba8e43c7bad --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.14123865982295705, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020202809729221675 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.22307005586506318, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026843151043358018 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.15897173803983955, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019161847895579139 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.02549989877808478, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000721064965974684 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.042454775928502095, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001263425095394224 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.029037224536015666, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007750760929234713 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.10788620148520663, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014459893492170886 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.17577354713606208, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021009355572715457 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.12234436969470364, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013376679737405277 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.13146543271367658, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018773489651251884 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.20864419850100588, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002524852725911657 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.14806558080664062, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_fmeasure_stderr": 0.0017682134567323948 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 1.1886223689461488, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04424171563604672 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3cd3262bced3e675686e1ce2c9682f7e88ff278a --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.1257754761068784, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0021554530530307884 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.12439497879445872, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0020706655200342433 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.108101701701726, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015518401988124507 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.013364384862998537, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009762375321509258 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.012700824707301519, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0007318396121340969 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.010536736032612563, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005708911085444898 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.10102917482089223, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017876771518052959 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.09927362783573078, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016105321153965717 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.08587323698176857, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011828740918808 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.1205631392182017, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002046685977843568 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.11957668500631019, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001979489711031859 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.10374596670283773, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001466352422028802 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.8511460277480039, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0888846234785566 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b96c4644fb23d675912ec741cf114735dd8d49a1 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.2456527083977364, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004266235274873931 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.18111674598382005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002792493928605254 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.1665670097445064, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022225643848745908 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.07134765002917946, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0027529254958689104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.04279317497612642, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013473083232237544 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.04013034602187669, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011603724359419611 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + 
"rougeL_precision": 0.19750190808423526, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003722471595541706 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.1410130748053428, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021637107870954935 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.1296562897314334, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017086381457813544 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.23277149049914897, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004135498321772824 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.16985926116644126, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00260288060701269 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.1566152490230738, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020971815171293004 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.4589494304991546, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07199650932201095 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..38496d91a3711fde18366e31fcfef551b6a7d887 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.24850283041195112, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0046336444774253555 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.16076069150010056, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029394946336236075 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.15523387532764027, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024315620580638354 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.07693886867031917, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002930517622308709 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.041690092863002085, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013593030670533894 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.04094764568299239, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012144142736985179 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.20269496412708723, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004051291594703973 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.12562253037302593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022620290785844077 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.12195613865782994, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018817813465346709 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.23597732477141262, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004487490296782531 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.1509130671270635, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027512008497031313 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.14609016469928118, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022937337012322694 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.043763896129358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09423626302294519 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..52a377564c2e2ac5b4cb357568d2ff3da25753a8 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.08108654249220046, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, 
+ "rouge1_precision_stderr": 0.003456659471152361 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.050899313397824254, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0022355417372530366 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.04938144254196313, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019543120580950636 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.025696529427655938, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018788007092688267 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.013858832830745406, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000977245425782863 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.013439239930666353, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008104222029332782 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.06722277079505945, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0030105977885906063 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.03997350902589808, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0017457034603758368 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.03922434499043311, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015568603703032396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.07726587114739103, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0033370296423210406 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.047824936820165094, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002104240678200406 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.04652217653531876, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018423780684281958 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.025383968929684336, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.004535204806063187 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 
3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..eda0e5ecee589880cf06602f7c89c048c48c294d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.009996893133935812, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00132769559493904 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.006640185975806153, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0008719308871004512 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.00617290599529864, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000754682849470479 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0033897673826744925, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007496264574720278 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.0017024153835973313, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000337089156117355 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.0016426619083960594, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00031435695145720047 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.008527012867083883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012057970660817696 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.005349623575215652, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0007031830380440492 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.004975083635745968, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006128478221884961 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.009603533312820866, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012952829451305296 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.006297727244644937, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0008337320152137014 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.005858547923035401, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_fmeasure_stderr": 0.0007214266745845214 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 4.1376706030398795e-18, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.7050554509881984e-16 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ec2600c0139e88942cad542a26a393a723773f2e --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.1325770613482503, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0025192320948419973 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.20000361049618887, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033883319122602765 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.14568887330505176, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024304012352800524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.030157012060673342, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009243132657339652 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.04736363206911103, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014655640703641456 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03380900442536606, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009820401088417524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.10196660477211375, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018957212855749772 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.15791691922428616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026796281489341964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.11261046327516765, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017887243866971374 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1229739908341885, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00235776548625545 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.18595911596519776, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003178608298694486 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.13507023756955555, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002258433462891094 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.36187522199539, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.13845059941627463 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bb2a3730608b09ba980e63e0de5aaef49fd0c0e8 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.253166878261391, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00348988815268275 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.24765343018422936, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029563082706124282 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2120115141592031, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002201889759622243 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0699683530277847, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0020960827184197486 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06305007691775547, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015388476493559788 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05447927752503051, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012618043977617596 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.19452273753171953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00287285443881589 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 
0.18824812828488288, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002306801191665917 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1602275848002253, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001660994268757166 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.23777364194884903, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0033469123908777755 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2311323526084588, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00275352833642066 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.19811964314605882, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002066579949503973 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.3670705128175316, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11604750071877634 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..079e771fb8eb01afb8258ac9e7edcb8ceb2fc0c9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.3042280180045783, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0036548780493867858 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2657722896569855, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002856010555746096 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2423480992295036, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002213386982039197 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.09313769731615627, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002310899996162063 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07567489802793778, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00165355417900699 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"tldr_en", + "rouge2_fmeasure": 0.0696826972387138, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00138974451583577 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.23678748519073842, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0030313841471723817 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.2052468833712137, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002285031155511772 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.18663522201860716, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001740118367633801 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.28748422717826916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0035473688052382567 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.24946630704943015, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026967966099014384 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.22792203815398218, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021140458556974436 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 4.453155894580253, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07102095729922746 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..61eadbefb42e1f33f01e0d7d83089abd05e5aa6c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2674845496647863, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0039889863741841 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.22160894672160433, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032468878599473164 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.20630036414089614, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0026291569934361863 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.08211313390771198, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0022759780285685387 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06436903346497445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016476213678321223 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05970860054565866, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013699064127955533 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.20900633859522688, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0032474779669193566 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.17205068176936356, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002592219077108911 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15931256883075381, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0020301040819156045 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2532503820549326, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00383540860381719 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2082619543002508, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00305787284903526 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.19413853066180423, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024798764798439764 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.3998244876185093, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07879957613227935 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eeb97146a6c62b02abfc1334afe529a382d33aa5 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.092208189110962, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 
0.0034189214358844324 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.07224586073962391, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002716708158950956 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06668350472619872, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023223709305170443 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0276141917171982, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0016764660911735634 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.021177864192540377, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012371206877780988 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.01919932938843845, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010049481476114978 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07354514541641334, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002828654152146427 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.056562703185008854, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002168458322152195 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.05216028088524061, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018383425985936733 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.0875093402820744, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003281948169701787 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06780620805618982, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025609050791819783 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.062781251763141, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002197731580536598 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.13483879286922137, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.011222185156728685 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_5.json 
b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cc9a03320d08fa3e99598624e13db9da3c818c87 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.015272421462795964, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015642398537380597 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.011340665089531033, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0012049966779817216 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.010526309548207395, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010240475142557462 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.004861708599486756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000775800963972687 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.003729726793524184, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005770564824763594 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0032474649318922875, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00043456520285324997 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.012655622009735225, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001358908157095457 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.009170085952684662, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009929814693818242 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.008507384832807946, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00083698404273872 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.01465587941098198, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001510887749703006 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.01078936789465723, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011506077585367758 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.010004236605713649, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009708230001026674 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.1051284504073666e-13, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.6395896486810652e-12 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..79d5a2ea74575468daf3970966c308107eb56f87 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.09059810453686068, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015056342386344352 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.1275422175021168, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002161826704046198 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.09697584440146213, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015059466263845065 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0074501677100613254, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00042807734501380953 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.011696196090971528, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0007203986479520007 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.008274258472602202, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000462953442764119 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.07617050446768113, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011196210281897148 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.10846177006819308, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016681767488937356 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.08174820152892148, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010962242599486596 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.08401444296488689, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001399793796844978 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + 
"rougeLsum_recall": 0.11878346946127412, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0020229669439021116 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.08999801889182339, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013981813175196261 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.510768349225832, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05232532369609947 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3daa200936d407bfcd9eafccc614223baf77029b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.11804797381560325, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015596770844967052 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.11938733271917377, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0016474965429255735 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.1046855555474759, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0012419703179327097 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.006483865876805875, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003960380307594885 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.007429698874370853, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0004988250290226641 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.006046177285954043, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00036102566099162976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09444605727198088, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012149153617127752 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.09562634721888393, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001293071021485607 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.08322112417343631, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0009222719266399656 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.11382960740162716, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00148706494623472 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.11526021094661845, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00157453107301123 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.10097615314050218, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011811528071622318 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.5467096839704352, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.040222632718890344 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..576685854a2c8e4c75d602d6c8aeaae6d946c9c9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.15227977915443636, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0023301138728988053 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.20288951458698465, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026600382652853857 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.15231216408550322, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018601662730155298 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.02590295524115366, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010945836739744803 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.034150951995566774, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge2_recall_stderr": 0.0011817222469903002 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.024920583594549812, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007991257423072374 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.11488386720934148, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017873853840310852 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.1542948926217176, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001993853857137535 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.11411475231199182, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001289095544101643 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.14358124760794166, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022003147816700226 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.19123947411527695, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024807666639582162 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.14348417776161898, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017290483390471727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.47208900301148, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04893525563104916 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..76bfccbcd1652d7c8971ce0f32e5867e0d90f990 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.13714187658215166, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0029978515027231094 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.1608525267779809, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002968280879200802 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.12327064005790836, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021375429901711062 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.029404520837665662, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014427560961137382 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.03107950418628417, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011977089285448693 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.023889029493667138, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008571997910771259 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.10640730246971045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0024200277079664346 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.12448069660646223, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022996154759259723 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.09418628790342458, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015811319768149715 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.12921646674810666, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002856413380294782 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.15106265513486006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027769655757646137 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.11568545523374031, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001996893570244593 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.645895957917865, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0936463946325968 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..cd961039d8b8ff80429538c45038f5a3b90a1a7b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.03827905412528618, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020579127622226234 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.04203913187572869, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002047851514306227 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.0325446644737222, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015222836283055165 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.010034383577420857, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001074414724889592 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.009208465166841484, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0007685505603330088 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.00712730989622718, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005315862223791575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.030774425912045378, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001732074858974945 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.03310013864383069, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016065163081856765 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.02542231281139547, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001170867527428399 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.036121977078342034, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019666280693929816 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.039179240084264884, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001899803034136687 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.030420087800970795, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001418424307914252 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.04745666192513143, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.007477301550260494 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..55fc8489d6c1578fae6b4e21f945ab95bd633d7b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.004178730643014739, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0008310089172819684 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.0036527771749499754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0005975815933235252 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.002971493717909892, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0004944141714386947 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0014207035995972678, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005157762753599669 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.0007326441133672487, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00017937753316799973 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0007222549546163281, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00019897321052266074 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.003300054936861825, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007057934923975211 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.002763617912706572, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0004345967221402754 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.002227968389907064, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0003632109933122061 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.003989732174305017, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008013379409808136 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"write_abstract_en", + "rougeLsum_recall": 0.0034524072384871265, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.000566274664103085 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.0028157162884264765, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00046819601162021053 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 3.148083063194154e-22, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 9.742672408621782e-19 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0d8d186686c9b7ec16a04afb7494f7e431d3c85a --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014933117490932577 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.358, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015167928865407555 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e8877e9277958c9701f0e03aaf042ca6a7650b2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01497675877162034 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015039986742055235 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e4cee29385aadfdaa23a625f5c2d7622871a8b3d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.353, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015120172605483699 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.344, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015029633724408947 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f01668d15cce025c573ae1d87d6ccbb403ca125c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015039986742055235 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.344, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015029633724408948 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c7a5ced3b5e34b89e0b1c5b9e926aa9d79441f91 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 
0.01499813134840271 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014987482264363937 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a2c9eb9812e5cdaded614c714a7914eb86199953 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015008706182121728 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014818724459095526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fb8a74d7443d48b0996f0b1fa924f68e7ad52355 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014933117490932577 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015316971293620996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..e2c96a802bdd701308f970519078e9bba7020b5f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015019206922356953 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014998131348402707 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cc966cf7c4f156ebf34b2004ea4d4cbb18ec4866 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.361, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015195720118175113 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.359, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015177264224798596 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..485fc48800b2fb4ab83b90d7ad23471703ddebcc --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014987482264363935 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.346, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015050266127564438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b9576c9fdecb7dd47fa6510dfe3e2dd6a54bcfb9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015039986742055237 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014976758771620339 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f956412b0e3ee69a77be0dc0b367a3304791ca93 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015008706182121731 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014944140233795023 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ea2d8a6e1c6b01ed48a32a489942bbf9ce35e799 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.353, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015120172605483703 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229857 + } + ], + 
"config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1fa229fa4b82a047e378d1577e5341a542f8a930 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..14408ab0243c53de8da484cd4f4327308954b233 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014853842487270334 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014830507204541031 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3bf9cb116aaf6ec8eb207403a3ebdca6d2e34df4 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.345, + "dataset_path": 
"anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015039986742055237 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015008706182121728 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d2e22a8df346b924afc7400d60e8d17a7d5e8773 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014758652303574885 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014830507204541031 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..16d2cd3fe633f2a626ecfcb193784fba46514ac5 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014758652303574881 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01470919305605713 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_0.json new file 
mode 100644 index 0000000000000000000000000000000000000000..c4c67499f745b15213094a8cc77dea1d54ae63ba --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229857 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014658474370509005 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..061bc20f22262463d29440721c2ee39eee177abf --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014955087918653603 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014976758771620344 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ab1e77163b7a8d07718d4f16837a353e5cad86e5 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014899597242811485 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014944140233795025 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..61136dd5bc9a3cfc4a637e9a167729740d4267af --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014944140233795027 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811494 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9c96d7af3ee08ae19572aa149a14b5e939beba2b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014998131348402713 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014998131348402714 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..84cac6629bf873f68e28ece2bec791b18d9c1d36 --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014853842487270336 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014876872027456736 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b520e965a84ca69d345231eab5c5db1d2cb67464 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.354, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015129868238451772 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d08ec6ef33a2dd4aa16717daf1a26fd24cb3c0fa --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d65a0c4f5ebd45fb9e960cbe5a441e51654234b8 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014944140233795027 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014770821817934649 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9e1887e01031acee52a95061f954bb1c4ae551ef --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.343, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015019206922356951 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014987482264363937 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..95319de901b4359a46fe0e4ed52dd01ffa7a7422 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014876872027456736 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 
0.014818724459095524 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..02432e216531356f8bee057c60ec670bed1c4964 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r1_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014899597242811482 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.322, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014782913600996681 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..13a500c5971637313fc0ffdec09d9dabe569c99d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014910846164229871 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.361, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015195720118175115 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b85786bb446a76208ac276ed402b9399991941a6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + 
"prompt_name": "GPT-3 style", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014944140233795021 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014944140233795021 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b9171b9856bacdd97f7a693f9b92c5bd08246864 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014998131348402723 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01493311749093258 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..988c4e65885765c6902cff2d04d4268ae78a1575 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014830507204541033 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01493311749093258 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_4.json 
b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c9c187278b86214c6fb78ec437c85ea564f757cb --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01484221315341124 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.319, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014746404865473474 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..97c626366216beb5e629ce2e64af2e2fb9c8eec7 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014770821817934647 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014566646394664371 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4187865d416d101b9500a85ec908df01cdcfce02 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014899597242811492 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014853842487270333 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..20575023cd7330bdfedf31923aeab198b9842340 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014876872027456729 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01494414023379502 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6cb4529c26a826c2acc524dbb4da39f386f1285f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014671272822977886 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..989a96f10eeda907ec628baa6f885810feab8681 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.306, + "dataset_path": 
"anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014580006055436967 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014498627873361427 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5289a44d52f8b88d2a00784f46d4838956a47e0f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.304, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014553205687950432 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014498627873361427 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2bf8bee77209c9f7d4eb01dc13d597de77b98271 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014658474370509003 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.311, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014645596385722695 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_0.json 
new file mode 100644 index 0000000000000000000000000000000000000000..f0fb97fe510a2631c7ab86fb0c6dc5809206fff9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014709193056057127 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014888272588203933 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b705d7a8064efc4fce4a6f60d5b3e7d5de203a90 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dc3ea6b7e26208b49b53fa7a0131cd509c81eb72 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.308, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014606483127342756 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014671272822977883 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 
16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5483155612ea1b5ad567b8cfc04a0299aca4dbaf --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014721675438880212 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014888272588203926 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..141e342e42442fa306bfc69fc088b991ec1c1138 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.308, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01460648312734276 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01468399195108797 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ce0a12db93007fcb17c5cfb93fc1667b03c78f46 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014683991951087967 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014721675438880217 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9ed1c40d002a27addb8491117eb3f26111a6790a --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014888272588203931 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014987482264363935 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bbacd925196a4fb9fd4071dbded362f363be3922 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014721675438880217 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014721675438880217 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ae125347ce59cb33b5d166b88dff3f1f9c4197b9 --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792518 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01470919305605714 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..05d284f78b31e620848fc2098e59a5636aa37800 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014976758771620339 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014998131348402713 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cf8fc9e31cc157e4e28bacc732eb5f6405fc99cf --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014758652303574883 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014806864733738859 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bb3ecd5a02aea91e89ae923e14a702717307445d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014933117490932577 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014922019523732954 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c7091d2146f1db79244fa9162b131b16cea68f05 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.309, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014619600977206493 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014865395385928364 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8a05cb5a61ede1e3974d237e8b2769039d6d879f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + 
"task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..021f7eb868634bac9401fc5783736951661d6bf9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014758652303574891 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.309, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014619600977206486 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a97bf81e004f1d5f5d82b3bcee5015882b7d0568 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014566646394664378 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792515 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..781e6a8d1468883fed20521757a3092f14bf865c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.303, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01453968371053524 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.306, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014580006055436965 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..474fd4ea73c4d8aaa11a6a311240883f610db461 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r2_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.304, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01455320568795044 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.307, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014593284892852628 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e22eac19a949be0f104f07edab625dab05a10a38 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01364760294240639 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3408333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013688600793296939 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5cc68b5bd81bc4880fbc89ab74430c9799f94fc9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01364760294240639 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463658 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fd4bef59a803a718c159e37de8479dd2b337995a --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013508372867300219 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.32083333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013480882752851557 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5feb3b954da52c063195476ca831b0f15e3b23a0 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": 
null, + "subset": 3, + "acc_stderr": 0.013471620929769135 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013499258621103244 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2f4ce9bc8e6bea227c29acf2164395bd95fda438 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.31416666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013405399314984093 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013579531277800925 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..47312dfa98aa71e28ff84e003b52e1573748c9af --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251944 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013526454480351018 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..c7c018f5339de44ac131ddc353875ccbb1dfbedf --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3358333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013639261190932879 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013490095282989521 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5d9705ab05aca531c32cfe6ec249042479f89636 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3475, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013751753243291854 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3475, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013751753243291854 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..980655bc793ca86c0eb5fba054805ea6cc49a604 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013696658778002515 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463657 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c801a3e4052b886eef3a2f6d96356527f5f2fd9a --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013630871843821474 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463667 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d976f798be1ce3ce8582dda8df04b954fb025835 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013622434813136774 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01364760294240639 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..81418a4890f2cba6a742b308e4b3742e020d5807 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + 
"acc": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013605417345710526 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01356203291952902 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e72146cdb4eba7f26abf48e7789fd835638a883b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013613950010225605 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3308333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013588208070709002 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b81cd684e9d38a450a90727c249fa092da2a90 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_2.json 
b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a2ea276fe1c27bde01e68d2d643affeed664057f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.32083333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013480882752851555 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.30583333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013306526255831156 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..89eae48aa91ce712610369b61ae279d9dcb2097e --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013553211167251953 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881629 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8edaa2c4db646f4453bc47bdeb222d43c9eed87c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013544340907003663 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013526454480351018 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4506addfbc1181e63d9f18a98ff4f14e06897a9c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013544340907003663 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3275, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01355321116725195 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4127585ecd9ebf7a4299d3714cde6a951941329c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013562032919529017 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.30583333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013306526255831163 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..39dcebce179fb6e03772efe003c84b367e823032 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_1.json 
@@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3308333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013588208070709006 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013622434813136778 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e8b87fa9f407ffb7cd55b9183ff33280ed1404a6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.31833333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.0134529489969963 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32083333333333336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013480882752851553 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..48332d8b15a95128b3d4395222b3c8ce10199605 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013544340907003665 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32166666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01349009528298952 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f4e410d53b5e56c2928a7adc5bbb49cd71824488 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013579531277800922 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.31583333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013424568830356443 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cf7788095fc665b3b70e6a63da77c8375bd7c7cd --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3383333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013664144006618268 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..503b7dad76f6e6ac09870538d505a3942235335b --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013517438120881629 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013622434813136774 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..562937445bd35eea309ace03f94e6e238d04b642 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3431246dd333b48b2c31341a6dcd98b11347327f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013613950010225608 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013462309712005134 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b9db37d7dd6fa2f8640133b06954eaa8825ffd24 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3408333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013688600793296936 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..97ca1fd9d526c233fbd8aed24746d39f11ef8a2e --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01360541734571053 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3308333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013588208070709006 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6939dbb9751b526812c58d3ef771020d94f0e845 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_anli_r3_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013544340907003663 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + 
"acc_norm": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013655897185463655 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..726a027fb6e3bec3a878ab59c63cc8c53b5bf4c4 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23122866894197952, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012320858834772273 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23122866894197952, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012320858834772273 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9a1c32c29b091a71a7b28ac00c39a1dd9a9ba59c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2235494880546075, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01217489663120261 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2235494880546075, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01217489663120261 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_2.json 
b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..944d8d0e30e51735c793b4413346c6b9373b4a57 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2380546075085324, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012445770028026206 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2380546075085324, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012445770028026206 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ddd452e10397a57ed742c544fc87bea0561a5950 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.22440273037542663, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01219140493860384 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.22440273037542663, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01219140493860384 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3f5cd30cd2242da2b20fbd0fff758bbd43b773f9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23976109215017063, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012476304127453961 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23976109215017063, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012476304127453961 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..58f59551c6b1f9f61127b4bc2edbcd3ab22a47bd --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23464163822525597, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012383873560768666 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23464163822525597, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012383873560768666 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..619aaeeb1abc22331dac456936026882d24316d8 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2738907849829352, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.013032004972989505 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2901023890784983, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013261573677520778 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..03eb5e092497b54ad06f4055e5fa3fced8ff5893 --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26791808873720135, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012942030195136414 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2935153583617747, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013307250444941127 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c5e771d8cf992d94c62764fa90e5568234d138c7 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.25426621160409557, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012724999945157741 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2883959044368601, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013238394422428162 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ccbbd328a7e0e097a7c1054da0b8052109196612 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.23464163822525597, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012383873560768668 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2815699658703072, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013143376735009014 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..529046fe83d3cec5a7daa377788e4195ba3f7ab4 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.24061433447098976, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012491468532390573 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2721843003412969, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013006600406423706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6e4120bd8e1b83ca14aa0d5a3f727f965073c438 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.23976109215017063, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012476304127453954 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.27474402730375425, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013044617212771227 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..45d0ef8b1f76d91ceed0965bd7c789b2e5d28173 --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.23976109215017063, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012476304127453947 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2713310580204778, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012993807727545794 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9dc6753916ac9cd0de9e96e55f4f3d2bddcc6ae2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.23208191126279865, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012336718284948854 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.24829351535836178, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012624912868089753 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..51a1d251b8ca94882bdcabf7553c388e189e555c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2363481228668942, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01241496052430184 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.24744027303754265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012610352663292673 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..332c75d3ce3e3eda01fe127ce477263430f0227f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24573378839590443, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012581033453730114 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2619453924914676, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012849054826858107 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f46392edcd7d8c0ef2735dd911a5387bb435d344 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24146757679180889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012506564839739434 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2627986348122867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012862523175351333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..973c32da63936880add913cf261eb62ff8bf26e6 --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24658703071672355, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012595726268790115 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2525597269624573, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012696728980207704 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..37fa7dae01273574825cfdb4fdeb58dfa46a73fb --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23122866894197952, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012320858834772273 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23122866894197952, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012320858834772273 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..94ebcf16c651634e25eb1105149e431e1cd8c5e0 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23293515358361774, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012352507042617413 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23293515358361774, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012352507042617413 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8c972b348c67d90979fad359dbb5bc512c836b93 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.22781569965870307, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01225670860232692 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.22781569965870307, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01225670860232692 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e97678811a8da82d59be2b55c8b7b20109362506 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.22013651877133106, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01210812488346099 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.22013651877133106, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01210812488346099 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 
index 0000000000000000000000000000000000000000..9cd8826d2c90b12a288f7debd591efbf1a1a3277 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23464163822525597, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01238387356076867 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23464163822525597, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01238387356076867 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0ef46dd213b79d0c386078612497cabaf0a8e0f8 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.0122728535825408 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.0122728535825408 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1b0c0c019991bd5d22dd19cd46d3c354551c8031 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.25426621160409557, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01272499994515774 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29692832764505117, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013352025976725222 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7191175de80160bf78813f5f7120d7005ae6779e --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.24914675767918087, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012639407111926435 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29436860068259385, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01331852846053943 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fe748e762b818f59d704648d4e40c1c81dcf1773 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.26706484641638223, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01292893319649635 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29692832764505117, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013352025976725222 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..76535fffaee0f2afd100b90908931c83ecd69da2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_3.json @@ -0,0 +1,34 @@ 
+{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.24146757679180889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01250656483973943 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2841296928327645, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013179442447653887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4975fb1c8bc07e8f14334334b1055c7a7b12d34c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.24146757679180889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012506564839739429 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.27474402730375425, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013044617212771227 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..412fe94f58501a36b7afe4c3bc0f968c3a5ae0dc --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_challenge_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.257679180887372, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012780770562768416 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2713310580204778, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012993807727545785 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5c7555e2f71ddc8f34064907c36411e9239c92f2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2516835016835017, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008905088235948782 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2516835016835017, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008905088235948782 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ab47d1d489e17c001d29dab94c8a9cc58678bd0b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24116161616161616, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008778027378258016 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24116161616161616, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008778027378258016 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..448818ad92cf40d7e415a45ca7609b2fb8e7e4f5 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25715488215488214, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008968394768971995 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25715488215488214, + 
"dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008968394768971995 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a172c77c022f50f40edd3dd2a91d39dbce44b334 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25126262626262624, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008900141191221641 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25126262626262624, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008900141191221641 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..789bdfa4e531316bf3e7937e0ca065700e9d70ec --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2529461279461279, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008919862739165618 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2529461279461279, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008919862739165618 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..507e61d17d4605bf8e550e95c139c40923492065 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24915824915824916, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008875238553583176 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24915824915824916, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008875238553583176 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..806f5b9aa77dc020a9260635deaea504a0618242 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.34974747474747475, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009785578618940728 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.31397306397306396, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009523245335215511 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..672ae773438552710d3717f7c782694859171191 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3265993265993266, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009623047038267649 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.30513468013468015, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009448531094163907 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..73ea27ab0deadd2082d6fd9fcd0a2402f5b4b7f6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3122895622895623, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009509325983631455 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2882996632996633, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00929477425202962 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a7b7d8ea98d3642c8a7a70a3d132af055dddcd24 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.30176767676767674, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009418994158522527 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2887205387205387, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009298805565435518 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7295dfe26bed715571bd1c8af9b4e1631f6c1a59 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ 
+ { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.289983164983165, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00931084097076903 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2857744107744108, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009270380606981212 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..18427be6daa119d60d9bd1c3f4d29739c593dc57 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2878787878787879, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009290733161670155 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2866161616161616, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009278551100969302 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e5ae24a13ce7ba0f262e4f87fc7ba38e1055c064 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.27525252525252525, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009164888895174743 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2718855218855219, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009129795867310487 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..543140208f4f484562d04e3de0f29523660743e0 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2735690235690236, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009147424438490736 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2765151515151515, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00917788010146828 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b49766b7271b76aa26fd5e96f7d80e14acf66c64 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.28703703703703703, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009282621598983076 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.28745791245791247, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009286682281593406 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5723fb82516ddab44fdd643dd82b2b79b3b3adba --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.27735690235690236, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009186490105111906 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.281986531986532, + "dataset_path": 
"ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009233124071053663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d0e8cfdf9d5596e1dab9f5ee7399e907cb5cb56a --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.27146464646464646, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009125362970360623 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2781986531986532, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009195059601583897 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..170534545e85dddd8c540725e925b0a48fa4afd0 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2689393939393939, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009098548093009163 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2702020202020202, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009112002229119856 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..bc13f27f8d042213ae19f32672b0f2b75578d4c1 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2521043771043771, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008910024163218202 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2521043771043771, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008910024163218202 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..141fb20a8c0d7e623da31b71e2bcfd55a7a213fa --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.0088449845819349 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.0088449845819349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cae11c87e6f74e97f3cf6497c1440cf17c108178 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25462962962962965, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008939407288589414 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25462962962962965, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008939407288589414 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9e8f2beda56755814af94322357e0136a9b976c8 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24915824915824916, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008875238553583168 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24915824915824916, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008875238553583168 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ed9fc4fa37268b7a5aee0f638b13e0cb58f2c293 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.26136363636363635, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009015838366608193 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.26136363636363635, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009015838366608193 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..de23f0bd829dd49081d17ce0300a9e94666136ca --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2521043771043771, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008910024163218195 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2521043771043771, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008910024163218195 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4473c018b5610237993d54573b9883e26b6f32e2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.34385521885521886, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009746660584852448 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30387205387205385, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009437524848293738 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c2b94929687ca1000aa173c383b41248e08a9e55 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.31734006734006737, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009550648343947775 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2958754208754209, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009365854134140057 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5efc5bc30497bb6695234624b40b49613e40a140 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.30765993265993263, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009470292575831178 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.28914141414141414, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009302827114597425 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4b1b20160ecdc5f2af9806c850c3be551585df76 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.29419191919191917, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009350328648861737 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2824074074074074, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009237303403479329 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7fdc7ee84edad4d56570ec04485e1058c1f04670 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", 
+ "acc": 0.2857744107744108, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009270380606981212 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2857744107744108, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009270380606981212 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..66451e9acdd99f7fbc90b1b8eda24236f080e795 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_arc_easy_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2878787878787879, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009290733161670159 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2760942760942761, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009173559873835257 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4d45c3597d6306a385cf3e63e9d9022550dea4e4 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5356666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009106972161130877 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6206666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008860362324722525 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c9a9e984c08704e736ba8a4e9523233a0da21604 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.555, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009074825112195825 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6116666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008899620943397697 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6f62ee83ba05c6f847759d6aa7c97cea7b6f2962 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.566, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00905033901089172 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6013333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00894075859420942 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2a67083ac988f92cf8e8e562333a01cd37b832af --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.554, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009076827433934436 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5936666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008968593186211774 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a88dd38ecc2b60acc45cc64910491beddf3b5d1d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.537, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009105198233882231 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5763333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009023204169172301 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..899d4ce10f6415e42eafa9b1463e7376cda0e376 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_GPT-3-Style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5243333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00911941249154913 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.5636666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009055910870388477 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bacf4968ec8dce2d4314f9b6f73861961fce2035 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.6236666666666667, + 
"dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008846558976258922 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.6013333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008940758594209432 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ee08cd8a464229223c8d9aba528009ee5ac30c08 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.544, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009094810160596324 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5453333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00909262640355374 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..489b3fb16b9329b0b9c97ba8b5ccddc28dba185b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5743333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009028770205053249 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5696666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009041170615977853 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..566fa8f4f89f9a002a15923ee1538764a661fc0f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.572, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009035073003655846 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5636666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00905591087038848 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2c81443f83ea65558f693c2acee9507db3a31159 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5796666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009013590979636825 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5693333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009042024977931079 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..78d5cd1676e10d914a252ac4f6c58c176ed29fd0 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_after_reading_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.575, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009026931658379624 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5683333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00904456300170546 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..60a85f93bf1b3694938e15a955322e31d110b5f3 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008846558976258922 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008846558976258924 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1018362bd90900f90211cae10d885916e94f7dd8 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5816666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009007620714085663 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5756666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009025076316539067 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..33774a88e477b08c3a87908a7c93467c31bef638 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5926666666666667, + "dataset_path": "super_glue", + 
"dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008972056373066367 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5896666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008982215188519145 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..69fae74e9623f58e185dcd7ffdafcb6a2a3388bc --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6046666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008927944837940476 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.6013333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00894075859420943 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7bc6c7203b141a071d46ef115fed84f7b7698375 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6073333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008917381440148328 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.602, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008938230472973836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_5.json new file 
mode 100644 index 0000000000000000000000000000000000000000..2c576e8567253998404818d2dc63689fc04116e2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_exercise_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6033333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008933122315228996 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.595, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008963915658236378 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..de3ceadf4a3fc736ccbdc5f18e471cf46b80d2c3 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5503333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009083851457629941 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.4076666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008973202213879664 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6900becb9a1a65f60b007c522b9e58766fbbf8d5 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5793333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009014571254680415 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5736666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009030591966818142 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9c3a22e15fdfa06556e609bb10cc97610eb88f28 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.584, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009000463105420336 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5783333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009017486788769118 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5920ce7a8922593fe20b9d8da7aee33f9845ce2c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5816666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009007620714085667 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5686666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009043721169619537 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..04eed7ca77ffc5208b1430d9fadebdbafaa398d4 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.572, + 
"dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009035073003655844 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5573333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009070008341418438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..12275b8690564878014dad085b7f19edde3bb506 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_valid_binary_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.555, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009074825112195824 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5433333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009095877403306732 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f3d37da0d40924cf7f7a338d5033462652a85e85 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.623, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00884965755342756 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008846558976258922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2f725574e9d03808018f267b6e5d22769935a203 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099982269204863 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099982269204863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b7a43fbbec73f97a57ce28d975533eb3b111f818 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.5943333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008966262991425923 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5946666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00896509146797075 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d7247f99fa2c595656c4be71a4d488a794d4822e --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.6013333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00894075859420943 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.606, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008922697920438169 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..232043194a00a3fdf799a7996c0df6b19ee89e81 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.6006666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008943269429955152 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.607, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00891871708850756 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ea29001b30c441862bec5910f9abea6bfa36d06f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_boolq_yes_no_question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.605, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008926639623340284 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6106666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00890378508047089 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1dc08a3564a754022491ff39685ffabee1250c12 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.4107142857142857, + 
"dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.1940928270042194, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7191c3f8627733f708dc46f1d87dddf54f727cc2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.291005291005291, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fdd2a4dc9f936be044761529861ed3d34d75c874 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0663363415035954 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.3, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..31f0f8826958f0c885c814cd7ef6c1f8c403c169 --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0646095738380922 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.29239200515796265, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d1d9bfbb89835cf79d7f74ee42fe4367be046268 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.3392857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06384226561930825 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.29558442323553674, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a54f061ade8e9ef8367dd937514f7fce6a14e720 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_GPT-3-style_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.35714285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0646095738380922 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.31511470985155193, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_0.json 
b/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b6e72355dbefe48ae955ca4d82d1a3e0b55d3618 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359538 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.1940928270042194, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ce8bbd9642bfc56539c9d8b13229fc3bae758c2c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.31149301825993553, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..acccf83facfd63eb0c4a5734f4dd0827da30b07d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.33210150283321016, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d303768c453e35fe59c975aa314adf8c898097ff --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.29572649572649573, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e99a45116989514c5618274d24966011c906b043 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.3011063011063011, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f051196dc3aa7c5c810dc113332a73b6d9a5fa12 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_MNLI-crowdsource_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.5178571428571429, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06737697508644647 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.3561416013304823, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ae2b796d0db614cf65e70df2d42b552d25f8fc12 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.27485380116959063, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2faf3e40a91cdcf1aec60216b1f2cbb94de1ce36 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7b2580ae4b619f2f6c8009f6fd4d08351cd22771 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", 
+ "prompt_name": "can we infer", + "f1": 0.2930756843800322, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0cc3966e2830a458c53a44e83106cdfb79ccc0ae --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.28451178451178455, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d6b2496c6e2e9d7e449c42bae8d9c95a76a140dc --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.31015369110607205, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..56b609f35fa3edbd8058ea2ef249f2c0cf65324b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_can-we-infer_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": 
"can we infer", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359542 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.30233779879169953, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..459b0b65fdbb741c02b2d56623dec830b0338c2b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.2857142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06091449038731724 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.22072072072072071, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2587ee2f7eaed1ecf986a9554f423510bb8ab186 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.3808729165425035, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..76119a2becb4c6c169b1f44382ecec3a38461e3b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.2798452714419101, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..81fdfea2f9cef1489ff1f6ff1fa8f3e700e9f1f4 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.3208410636982066, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4ae065872cb1bc852f8116de98c2871b03b28f12 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.48214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0673769750864465 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.3494339622641509, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d35c7c342cb49d6db5cc6dcae82d7b11bf7cb2b4 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942397 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.3299346405228758, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b0fee4c40e66dc5be3be333024c3594d905ebf28 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2695374800637958, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0821bf232a6a05748b5107491b7b153039620948 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 
0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f2c4d4744d1e9f60aaa849f7db02bddaa30a8252 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2824214792299899, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fa4de0c7e38642380a850a561d285dce820cc8f1 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.26587301587301587, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_4.json new file mode 100644 
index 0000000000000000000000000000000000000000..d79ffb0326cbf09850747931ec8f3654dadc99e9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0672477765493766 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3361048122952885, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7afafe7e666e064303a2bf6dafa928d5b0b57d24 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_cb_justified-in-saying_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06585388898066351 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.28699781799512253, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6313b3e28331eb2f23aeed153a6d695204a3bde6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.57, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5f755e4429157a5f75a2a900d1a0ea4a033dc7fd --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b6acec1f54a4f561e2138139d4b379c276120a1e --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05024183937956913 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.52, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05021167315686779 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..81b9382ba1db9bea719b45ab7be3385cd3e2c5ea --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.6, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04988876515698589 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a0d7caf866f46510b2dddd659602b26a63a2b99b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.58, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.57, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cacba0697a0547f32a5497ff238706298b8718e1 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_best_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.58, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.58, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8cd5afab6c36d7b1f5632e4e4d442cf7ce48a594 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.59, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + 
"acc_stderr": 0.04943110704237102 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.53, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cbca1a457c815f4cdb9829437b21cac0aac51bb6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049431107042371025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f3fde684cb24d8ab3614764f604dc7b21b6e5aa9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..ff810f49a2847d73999d530adb3fc4431d82e110 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..76e0e1046e99be8b85f77df94c2110adaf8a7729 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ad3bc462b7d14239f66c1d4a1cd5d6721b960cfa --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_cause_effect_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.48, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050211673156867795 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05009082659620332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ce4ae3b30f53fbc790eab4ee6493e715e1ecb21f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.64, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04824181513244218 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8192bce74daf15a76541ae392e27f0cb5acc48b9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04975698519562428 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.36, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04824181513244218 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d84d65ef4502ad36882eb325e453b33fde8a8a7d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04960449637488584 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237102 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..86826911c22b611f4d30449fbf73abb36ddc7d0b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04999999999999999 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1894de1291ff7fe80ccce8bd332cb51acc08c336 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6b3bf94000877d7786568f7b7730862262228245 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_choose_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": 
"choose", + "acc_norm": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049236596391733084 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..efb9eb969eb814385a086d83f2f4ee6196f73249 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.61, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04902071300001975 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a41c03d260a7a8e6394fd7a4cb869dc0fb7bf38d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05016135580465919 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237102 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8e0cf4bd59b1371d7d02c6c300efe25215291dfb --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049431107042371025 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049431107042371025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..04bcd1a0db3763583dda7e941e1554758e682885 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dff1ecb7f017eb7e2392d52afe3c447ad69cfee8 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04975698519562428 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..57a733613c841e07a507a006ac7bfee61b0f032a --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05016135580465919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8cd153d2f94737288c4047dfac0bab773ad29dcf --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.63, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04852365870939099 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049888765156985884 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6f43fce41544f48e98a0191da080f6308c2ce477 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.46, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04943110704237102 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..53ad0e18cb4b8c7e22203051beb62f1b4a562525 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04902071300001974 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049236596391733084 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..96f6646f05cf399bdfe8910b6a172461e3d5708c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c40b18bc8277b00f690eb1e5b3c99c93bcb9d1b6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_4.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049431107042371025 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04975698519562428 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..65ed3d5986497aece234d691b0fc06a661c8a42a --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_copa_plausible_alternatives_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3c03139cb7a0101c4e0a61644b1fe4fddaa89c22 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 2.4291343231615197, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1081476953315167 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.30490376061705526, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0041264630846786585 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.28898151867581695, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003749068801492829 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.27814178842810267, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0034140931959471587 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.11936753561219268, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002002664631013812 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.10993101885288985, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0017805025745999989 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.10747367819482016, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016642061918427447 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.23846890714089322, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028662860213796654 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.2256288532111635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0026144614511853597 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.2171258274284509, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002322752684753328 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.27549124953733534, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0039982004120365 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.25261701652716345, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003253103645087724 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.24738399674674577, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0031658939531734086 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cb747e22e69b86fa9ead843df46fd4e8b3909f6b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 10.367490688848457, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.147199946045334 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"coherent_text", + "rouge1_precision": 0.5083228320631105, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004117353888131676 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.3987167805017131, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00352094056862558 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.42359842430158595, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003166429715222895 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.23914610666636482, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002866994640346074 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.18408201451158962, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002299318168589243 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.19604007520987582, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0022207416309622243 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.37329748745499713, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00342657052646914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.2897412832730678, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0027697546621908526 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3087285163618671, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0025534325140347484 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.41731459418954514, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0037508371487298972 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.32540873261888204, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0030943465702578284 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3463896607181391, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0028571074801555145 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_2.json 
b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3595c7ba4fd3b81a004d4209d2e53854c790e894 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 13.390236019438653, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14842576200349367 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5731248702131649, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0035369412980793694 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4540586945350603, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0031237260395784983 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.48107797071022573, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0025529853815747603 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.2886235972780352, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0029184758928492037 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.22509950336148685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002358526308667815 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.2384229110806097, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002198745135162194 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4274682346753081, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0032262516803834794 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.3355857923635549, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0026048668389623054 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.35636457776206126, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002269467152908682 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.47792410828677606, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003441578862987895 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.37679777676824877, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0028744498867330067 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3999201210250597, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002503427292150924 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..af89d1cc35826a5b9caa4deb607744fecca07bf5 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 14.11211105597524, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10622153797928337 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.586920292942165, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0033015565803634636 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.46387873389087075, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029845335098343225 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4933918814621236, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023507630543591544 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.296797885536512, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027849534966846853 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2328967005520696, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023744684508588587 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.24703844136495406, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002179120785142707 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4360600199796968, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030776623311689765 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.34248768718320766, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002574220770064095 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.36479402136291444, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002206895287411414 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.4892157265900996, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeLsum_precision_stderr": 0.00324195297597455 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.3863267663286363, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002838025198180799 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.41093370948084046, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024047099802087667 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..385bb3a1e1626dcdddbc6fc1a9bfa9438bbee3ef --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 14.231011749286287, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12209443108687626 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5917355418032829, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0033192291366049035 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4649612536810019, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00292734750937545 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.496456725245094, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023467453451773064 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.2993125833156767, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002816066206715958 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.23258741498970592, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002335135354391098 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.24822901966358188, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002183872217474791 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.43644364494193044, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003043256161686109 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.3423023863196363, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002561190633854322 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.36531375235590563, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0022079073968463863 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.49113203678064726, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032253527956686463 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.3864885531751292, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0028011091689833898 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.41241929556327334, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024101261730711376 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..249192bd1542e9b57ceebf780b09aa1f69e0f4a8 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 14.053424233811935, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2139098854103794 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.6007655467087342, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0033178934942541586 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.46208545695254966, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002843128768795738 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4992427047391062, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002287441676423058 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.305416095977467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002803489308110065 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.23208608723439855, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002287994510775193 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "coherent_text", + "rouge2_fmeasure": 0.2507510925889971, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002153672477363239 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.44410464141299394, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030541617625096406 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.3408331566739278, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002505664645443682 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.36829390265765766, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021967315271236207 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.4996850800292637, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032601873043127173 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.3845230626564954, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027439013265701992 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.4152808816287861, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023832871878115454 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..af1a8b419607d1461a2d346c6f790f66e1bfeb4d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 3.1714342318075586, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.045965301633825444 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.27892126811926726, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002600122184386265 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.44180866598838936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003143862734393032 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.3313708474779751, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, 
+ "subset": null, + "rouge1_fmeasure_stderr": 0.0026143663411672363 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.10968173222483277, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013140525894327851 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.175414562274607, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020139474282820564 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.13068696654151762, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014575624518219298 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.18435049608419762, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0015399304730552884 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3033010356499215, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002174783861918955 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.22199600069535, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015907140119287217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.21103060760030867, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0020453735196029944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.33604504573169713, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025253980447102206 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.25095884676584657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002058529882959225 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1770c592ca9d3adc493e32254b8416824dfc426f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 10.721803251704452, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.17211225173321326 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.5374028280682072, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0037863238755228563 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.41172220237757656, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0032971954655288843 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.4412622742935095, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0028484776236360502 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.25277948098530284, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002827753217659691 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.189574321126897, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002209880786417029 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.20377550295587993, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021293945685583646 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.39366543713570407, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003296033015397104 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.29758323325914543, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0026078527492868022 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.32017327345810115, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0023605669683786938 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.44063300992380655, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003575829648604807 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.3350879058384037, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0029240248465231527 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.36000645986461527, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0026448711945656627 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2d09ab16bf18bf79e0fe1ddc60ad4ad35e140b6c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 13.185638820912253, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.17905937600833666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.5847965205117387, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0033347702831086844 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.4531345534571351, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003022869025252749 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.484703373181249, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023713839952825065 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.29228753297166654, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028532818928023464 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2225530720847117, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002304379004839278 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.2381525007024568, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002136037647803045 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.4343959678194726, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0031186453492311928 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.33311897707583316, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025202919059291095 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.3573228133925937, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021528692893471066 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.4847599016564475, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032967216795653054 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.37369553194499533, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027755064030562265 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.40042732880195286, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002355966614225236 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1f1eac103f4f228f3793263215ebed705cfb8ece --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 14.107794954151904, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07762803738740154 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.5895720638586597, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003233593739284638 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.46460331525201143, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029286341256693055 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.49576437128816564, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023177278666779853 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.29542191408821167, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002760426839218843 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.23089746509831843, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002327871040394227 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.2459138034781126, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002162302420104042 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.4352618522184374, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029819501191046195 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3413198904559014, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002518962067001467 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.3645010870889512, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 
0.0021578123182128444 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.48929479088617955, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031632938136278974 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.3854980456003556, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027907143453587617 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.41121563779500553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023650416738361323 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..63f2ca85a577e7622ee4fa0dcd12d7230d521a26 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 14.409703264220804, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1737734796397264 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.5927894623799085, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032205049337101993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.46958390044562126, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029192980510850597 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.5011786848332749, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023379416400234657 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.29855627088086, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002759091662895473 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.23459170627954518, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002340077345372699 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.2502413808790468, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021979815342001766 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + 
"rougeL_precision": 0.43466325313168735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029448990645185706 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3431853955769687, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024941981600519157 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.3664874450908063, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021711317825607286 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.4921604150152806, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031835522336336663 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.3900149073671336, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00278747643338989 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.4162953555843644, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024261965876490334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..642de2f8e7e71aeb18649808bcb50d56f3aaf6eb --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 14.36873102419111, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.24341981242466684 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.595927773483148, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032425483991447954 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.4687061585590258, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00287648066845073 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.5021416185467042, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002304993139891524 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.3005779660822629, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rouge2_precision_stderr": 0.0027762341601524085 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.23417519742244133, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002322950198994913 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.2507085203896114, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021883812851069395 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.4389314551113407, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029888231854793028 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3445034163549983, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025327223922618452 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.36918156449504774, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002219219514765805 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.4937823771022341, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00318964228631743 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.38852149707353656, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002762325856795982 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.41611364077049817, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023925606250996302 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1bd915004a3e0846084242a32e5e4faad85a1433 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 0.03400024051061316, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.003924009727004915 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.03163495022271719, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge1_precision_stderr": 0.00213402606254315 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.013052648219502593, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0007368912322178317 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.01489202936976875, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0007731390330539368 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.007676432368912842, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0007079297568989281 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.005344496240226761, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00042498193988387223 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.005699997724184529, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0004296769766697958 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.03128230881527702, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0021215996271714685 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.01278853369132827, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0007111869202364915 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.014595759988507867, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007449503316530413 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.031288243684140286, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002125501401336427 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.012820813365160109, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0007273203927635051 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.014623667224382438, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007603892415303771 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c282922dd2aac510f6d41c848a92426f2f0ab9a4 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 7.816534164583369, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.15565768866774704 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.35702422811192785, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004519300459158055 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.30333646003854636, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003920539462119713 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.30802363196459925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003686633191390042 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.16461108331765129, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00270536053603032 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.13913189196694387, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022575199491451678 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.141349628450228, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002168592529358599 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.27136837128521674, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0034720568847473434 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.2277467839540949, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0029218597390161497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.23179943842474957, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002721088306573688 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.2992939969488658, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003911479322747588 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.2528327880900938, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0033438933458369248 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.2569619426281397, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003144471646322155 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..86eeaccfa0cba99d647dc56a3954a71548563b7f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 11.40727566060487, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1394149858863304 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.4581076639157755, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004355728871402772 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.39394217549623817, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003758657609304119 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.40125830728731005, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0034787841094553523 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.22093478501798178, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002899453214143421 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.18922034434713683, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002425949769422214 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.19229296968051052, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0023147668235538378 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.3370847303471444, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003384862536690441 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_gramatically_correct_text", + "rougeL_recall": 0.2890375434227376, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002864993682553475 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.294053280095868, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0026379563075927944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.37869382337261237, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003810740310963444 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.32547687183919616, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0032649447682233474 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.33140711720509614, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0030511867850170843 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f4034bc5ea788e181af2c73a09ce6038199d2431 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 12.648284327714496, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1412239962827133 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.49870772839379046, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004079935779929387 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.42681008778722584, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003596695473457261 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.4375240256725131, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0032382818772945395 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.2429520126991852, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge2_precision_stderr": 0.0028473077920114502 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.20834090112824027, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002501461063250894 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.21217343973137, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0023188663097364524 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.36347126840877136, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0031832297343980212 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.3118840814238212, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0028265209782573778 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.3186046651652964, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002514178531320354 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.41125757524693257, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0036259652798149387 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.35231454064715534, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0031900363709328303 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.36066633891312805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0028985429124288673 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e31f80ce7ff8df52bcb66517941bcdd24fa4a52c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 13.559446770038464, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2304781739750598 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + 
"rouge1_precision": 0.5259656210972891, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003996774173806683 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.4421099389675289, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003360129610911662 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.4587518411625586, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003111470911283907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.2604451621552076, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002891203306732668 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.21855085024457707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002416176572188965 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.22594698540162944, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0023352863911978252 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.38041876582911677, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0031927129913797195 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.3208557577808482, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0027105884255468845 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.33156794470795, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002486646659841678 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.4323128551857689, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003596550511256332 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.36432376421020746, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0030413424235041842 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.37728696967304404, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0028464369344208744 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + 
"limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c0954c7680fe6c2d306770584a8798fa17cbdb91 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 13.545588426744118, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2304131026428412 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.5409229061537895, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0039794946613703796 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.448377942904066, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003294797070295318 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.46891543241081723, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0030504683364321462 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.2688654034164371, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028781695998310276 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.22137196577153798, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023525122978486418 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.2310792018618973, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0022881465661611063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.3878387871633595, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003170334663588811 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.32245201866172035, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002654582527964297 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.3361248025687532, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0024554385707299736 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.4435204322386192, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0036158281316008125 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_gramatically_correct_text", + "rougeLsum_recall": 0.3679817539692809, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002990071151094627 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.3844448063353725, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0028242027174122603 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c7cd10935215d82f9abdd832e7b938a0583d41ea --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.0, 
+ "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3b877d8145af536356df2e49e9246143a3513149 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.807245592727982, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14268734921118004 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5694397994427649, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003239379072871242 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.43330345866585573, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003044911625941733 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.46557148222586153, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023657331986333577 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2723840046490374, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027516718471111436 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.20361914910986256, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022039744834451503 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.21912991483763275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002069315036688718 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4186095218334134, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002968616014528494 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3149916119112548, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024715985756080345 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.33955869538197114, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002087373657004864 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.46699348061463236, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003191007134428128 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.35381509149376406, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002783989467610118 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.38066023518841285, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023459300318536126 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d6b144bbb066febb0c46d18b93f8336cec539a71 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.207475603022154, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20911303585808974 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5838609934278248, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003219893788304826 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4676453406199286, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003012959553547953 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.49425084203142144, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023454913129034973 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2920964834179201, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002749941677148143 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23229590203180275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002340599122551353 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2450476239671336, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021622871202404523 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4326595154400953, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029553909822487533 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34449565433901846, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002541682059011724 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3646666089186555, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021523804779859127 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.48625392335924433, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031665719502181147 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3893023276736312, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0028360977602737624 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.41151419710242576, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023890835672487213 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5bcd8c63791228dbba0c7ad4181bef6a1f633719 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.881355021152629, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14798982038506134 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5813338263183422, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031068563548781497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47562332710532934, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029492958281890696 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5001074662527342, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022864719896120603 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2933839366604696, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002656083464844314 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23916505911615024, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023564329861628973 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2506081474778741, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021353634161434465 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.42886203590947314, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028648415383139367 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34942359580045534, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025299540186551716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.36772667157315414, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002137446714056548 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.48630515550742015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0030844914644101774 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3982270500785823, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0028293510736079205 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.41854738428796234, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023767509258669573 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } 
+} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ce4629eb162c382559f8e035d870e6a37ce5e656 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.161167960986678, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1434464937691823 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5828889415157229, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003147155952785506 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47816618968937713, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028905507376413963 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5031994429048829, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002288693420421454 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2947392128854523, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002705329023608755 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24017998180506026, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023323057167825125 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2523951605760761, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021595061752098924 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43041916996749047, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002887136906682548 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.352331798953218, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025196691672468205 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.37084325448755734, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002169564896258564 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.48846331616257255, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003116275576376514 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4014433365412298, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0028080914024126535 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.42211119537556474, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002395568997982313 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c263bee4edac771ceb0f544ab48a204424cacc95 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.199579399737006, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2197710642602159 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5888360095441599, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003159796703393573 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4770950472138781, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028673153056796336 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5056858022442728, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022854558866388314 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2996637182544266, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002755313978179728 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24042124929715428, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023404910908957914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.25480077699734965, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002191468155403316 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4337481481034359, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029181057449224846 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35012226658269135, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002503515068580577 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3714282516661889, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021747599500282675 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.49452192644513626, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031483071698951075 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4006792202843592, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002780394897574505 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4246436350463985, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023956383797766666 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..73b4a91fceddb57325b69de8d0ae54e18f18ce83 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 1.115288380550811, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.03584738835397564 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.0481634098481736, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016087097616289147 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.10707802462357018, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003385941066987629 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.06478686781175597, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002096643728341116 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.020012235219458552, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0007339674775155459 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.04536128791284365, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0015985026619880938 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.02703438376489087, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0009617584924329875 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.04072107560821441, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0013394434625095691 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.09210140922870967, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002942849426224985 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.05510083034332008, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017722509136964503 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.043583758115497744, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0014653987421120727 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.09700126154529612, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0030955294628613847 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.05863724251471288, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019106515434616845 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..de543eaf6964d2988e7d86970e5c61796b2bd20f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 11.190352808928104, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.18141915137411 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.5677445861811545, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0033984313369014468 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.41532706024078114, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002917300932047345 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.45251036431033176, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023124683743620694 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.2744861149641647, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028946434469851188 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "text", + "rouge2_recall": 0.19515249470299148, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021250810681362133 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.2134831249471247, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002033720545474901 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.4249656266902133, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003190196797303736 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.30648054044850537, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002403100236029252 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.33535417773592563, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020836745399405625 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.469797154283937, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0033750646684610054 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.34104989274687575, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002659577970371275 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.372491915618829, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022926855668308236 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..52fbac3593e753d825f2a121da6564fd72151fdc --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 12.679562321227413, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2222126775351692 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.584577354262805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003347858172867005 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.44205456550510863, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028911343461195844 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 
0.4781335252015599, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023177131734020964 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.29090406263628454, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002883255237867255 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.21626343858582678, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002250184676673601 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.23417338269491117, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002138781746654695 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.43779082304044564, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0031428137257300858 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.32836831949072104, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024657583395343735 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3559492850424143, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021548825504076837 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.48637793220824943, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003319535039264522 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.36657855951325957, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002716088252466532 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3969389628127764, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023548448674264918 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..09a6190b015af751c9c3787ba1096d07f6e4c92d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 13.302937463565117, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16122159664144584 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.5874865856088282, + "dataset_path": 
"e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003310089916183211 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.44727502252297524, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028944779700014653 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.48368846020707407, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023223560319026336 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.29353842921389267, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028487712438280198 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.22132076463133735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023142648024014117 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.2390946966736538, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002182786299360237 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.4371504816750144, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003083689894416344 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3310191386275968, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002498138917180889 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3583981701407721, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021794562630022907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.48871506210624377, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032662459958388208 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.37171338204559984, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027596290462697038 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.4020210227451716, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023835574293658724 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..64035ba1d2937b1f103c6d3e5234acc4f40b5ffd --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 13.433876035255205, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11981015230222622 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.5876353731004864, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032800870923946806 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.4488101154572819, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028867381201944163 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.48569076033953423, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023257013392330443 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.29256227650092403, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028389393416033155 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2211736710146012, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002322811471619675 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.23902637874509242, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021849331256108503 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.43579481435548406, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003052897338172175 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3317240836685784, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002502354705544728 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3591529229249466, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021946437964988253 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.48827906155886375, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032373730408784213 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.37332408627485886, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002763882689822959 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.403767269262012, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024006083640941635 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2d1ba3eec6c9a186f93d2864d617e34e74f5154b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_e2e_nlg_cleaned_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 13.506223827879625, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.13712432815643727 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.5928496432260469, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032872080307399898 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.44991984920712536, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002854639924992808 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4887998407141921, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002319156170252818 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.29918813275057654, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028678438572694092 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.22416121461467314, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002312463482767483 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.24345616505754553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021918190877205047 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.4417618550309857, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003074003692942964 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.333769883964141, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024914760495667993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3630039450500851, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002201392663162804 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.49454649911344245, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032669183780217074 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.37523026419685435, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027419119193864048 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.4076162949696335, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": 
null, + "rougeLsum_fmeasure_stderr": 0.00239783693691282 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eef215bbcf6fdbb0608a58086f1ab4b83a961f67 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.10961523878541282, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0015892800805468758 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.2760154067129983, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003714427703127575 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.15501116522406902, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002143767701315978 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.016503385727741075, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006937152978442476 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.04290433837786585, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001838510464413981 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.023537346012547193, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009840202178341868 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.08565381552764036, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011225671784216172 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.21734942060398876, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0027643020863370284 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.12136447200761914, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0015209692569129127 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.08820687667796023, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 
0.0012712858820566506 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.2240859928005493, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003132931618671561 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.12504245478102102, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001738309343000351 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.8131130851284528, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.04714338380259805 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3387280b8fe8d17ae78804c0f18e56ada9ca2db3 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.11659718760405827, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019742524127726855 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.149104507025026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0029509495609674774 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.12255449947246683, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.001955813890814555 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.006027752453637144, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0005191187509380493 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.009307727994348461, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.000907548674614976 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.006748722038979709, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0005881345903334706 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.08952779755714778, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001481045986907457 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.11272020580989889, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0020202289653695653 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.09345694256222935, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001393697589554955 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.0921370778595584, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001508772594316064 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.11881744272884845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0023731154530475876 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.09701653702411298, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0015017590341849362 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.4318908013687936, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09340356040913948 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a55cb1f4bae42c9e0e6fa784b8769d08be95b125 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.16677371184257006, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003263851048544926 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.1865625419381177, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0036738567366226727 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.1641766923037994, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002787542118343757 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.024073436874321924, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0014755313251919172 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 
0.027534094582926417, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015892930983602675 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.023361198351848255, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013103167812832366 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.12573595130504708, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002419342312394539 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.14040929192793153, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0026694063784176297 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.12354629572040322, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020211011832145563 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.12809116280754843, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002429412800673777 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.14548807960651497, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002942315760362548 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.1266583625631351, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020977751126439147 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.2185124475941114, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1247918930209988 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..252b32b4949210c550343c956f29a946118176c6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.21090220523104927, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0043400296722759715 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.20623055336862026, + "dataset_path": "GEM/xsum", + "dataset_name": 
null, + "subset": "", + "rouge1_recall_stderr": 0.004117296972903804 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.19540218617384542, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0035981591988257 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.04446812394522324, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002433325283250796 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.04330763423954221, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002130444343579905 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.04043225200079106, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001971128849289818 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.16102102863101164, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003522787509763259 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.1562623845114745, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031877827820907226 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.14823954382505022, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0028266046266465796 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.1632209386021339, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003534529597444769 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.16014367488766096, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003360192272089281 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.15084505474710264, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0028701096981838467 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 2.384083581683153, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.19545490163541676 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file 
mode 100644 index 0000000000000000000000000000000000000000..1b05ca1e5de2532f81736b2029e37419d7d9a735 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.05924863405775099, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0037430888437938997 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.05315091802828831, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0034485744515216376 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.05206730499634627, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0032224075534875705 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.013141821010449908, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0014263480788354573 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.012179899285613686, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0012588083212497355 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.011639585528160717, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001181669731295198 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.045593974139523485, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0029554838918758393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.040334899215324675, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002664958357606189 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.03966046102164665, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002501900408036937 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.04623428184973231, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002984379602534541 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.04133658908689536, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0027627667262542506 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.04033802381773822, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0025377176339826104 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.09150173402093377, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.03488801022859231 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9873d114aff0e70a459fa954c22b08193ab604a6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeLsum_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1d77f007ac349da5ce5a6dc6eb4a2b9ef99d7902 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.14342162008284204, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021489787880454073 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.32838306815833007, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004520779915543697 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.19594002271344585, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026553778162506795 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0324041203368683, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0013857024425616468 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.07633647234365959, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027120097095923 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.04383970111779196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001502517751586814 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.10834989026863737, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017010407378355936 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.24852399538651415, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034523875710231393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.1477613060651902, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019730423409828144 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11337131132440753, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018502354074941042 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.26102578003284527, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rougeLsum_recall_stderr": 0.003905320793767065 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.15488179011332442, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002226328551514201 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.711960179300553, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09364733946460982 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..649b02749fffb23ec8129a154d2f6b99651294f6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.218502654216416, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004216701848226965 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.22612472735288322, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004050424823157234 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.20735885392080328, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003416950171252691 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.04666208027558217, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002339875983008479 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.04728243481580427, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002144678600899603 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.043166660515364066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0019489298067270461 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.16554354693857987, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0033634278225486212 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.16887951213275842, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0029798022422798257 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.15586444152297085, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002611115681188433 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.16802010031692538, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003363169417299626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.17423319642542318, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0032438974470327254 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.15904480903777268, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0026546734966954384 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.038892604450971, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08659921775202399 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..13e3a051229bddf596be39dd49709b4f94d9a39c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.2686202317908267, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0043807329681565515 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.2442620399832786, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0037187930025226325 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.24502315675229497, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003527107028051239 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.06761930566167147, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0027582692970284004 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.06045947601825875, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002402252483561905 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.060811060789406665, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002362985597929733 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.20701714470631166, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0037001369903729023 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.18782901699110188, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031305668524696704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 
0.18846932992317159, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.003002336243674728 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.20779734909360606, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0037041900951829566 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.18902539617665143, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003170681415495645 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.1893130337834815, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0030095150279783 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 3.4680465130026668, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1806305418260011 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..365264b3a36dcaa57a58341e33ad992b525d5f6c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.2704616336812799, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004785011358046082 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.2338132083832053, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00393975832858179 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.23897677492951283, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003816266861615919 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0686867366515392, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0028770176560848778 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.058824913888813044, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002334355572633918 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.06053382888543694, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0023807363425603057 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.20739899599482822, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.004103033934906544 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "DOC_tldr", + "rougeL_recall": 0.17693699968227222, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003197846899502375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.18162079511431126, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0031790525970022214 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.208443470756841, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.004123726328746496 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.17832743614767127, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00324196291285183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.1827225485802873, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0031988935917358708 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 3.2188873987668196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.21129832875336893 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..27949d4ec486ea53592587a54a654451739d7e22 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.066217183773399, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0042555829917650285 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.05213679480213446, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003385771564869977 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.055389768861489226, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0034919433178371593 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.017664150375129962, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0019205293888410434 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.013372101380357073, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0013831809690219824 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.01416209803140751, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014186880826512003 + 
}, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.05106781746182636, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0034158251072776654 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.03979759120276381, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0026778266516859447 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.04233447640190309, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0027559578153928876 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.051403778989441484, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003427827319089082 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.0401050806117895, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002691735875078422 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.04262294039262988, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002766382698213603 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 0.04659629275993456, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.019558262715467265 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..df7479019e168d223841c4b476eb2de89689ab90 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_DOC_tldr_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5043111707438f6e99fba55c03da6eeff78cdd26 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1442510190731307, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002377555886882021 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.32728957236911, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.005137969407531113 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19658526341354268, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0030181310698989858 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0341581223240638, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011714333255302305 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08176991191725373, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028112899026499264 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04748504386381342, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016110390499460413 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10623987893735275, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018433462761128848 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.24157333634286496, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003878678496036975 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14457182138352206, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002237374752785832 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11372296782139678, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001999008147870968 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.25936158541657245, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004311577104350546 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15498205908882362, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024811396605881834 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.051690852375094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0919729448596882 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..724b8d3cfe86310d48d5a902370254cf07f1fd48 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.21346282259038915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004034521833789361 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2499714791367549, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004127977259047798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21363061590069807, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0033936589765782323 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"article_DOC_summary", + "rouge2_precision": 0.0463678167671605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0022751074397479123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.052694807564007726, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002334059403839543 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0455477081115806, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0020625955375166345 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1620749435386503, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003255513503258099 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.18819404884414812, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031630393650993625 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1613719463559732, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0027032991504210456 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.16522714514641704, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0032476758014330694 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.195040375695118, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003405905178416842 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16555112242432596, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0027468382198912893 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.019074157200998, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1854543722412079 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ef8d3a00737c54523ac3cac3788bf65a76fdc17c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.23937462775757284, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004035761510891073 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + 
"rouge1_recall": 0.25228250851051215, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0038577616272618006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.23064419705209097, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003326199002008987 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.05306913769010263, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002351977805815094 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.05441248374509546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002305037996521187 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05025303080761008, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002124751664870315 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1812036165079183, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0032724786112898127 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.1907813012976707, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030970666700906747 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1742531731484881, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0026993941796824017 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1831569851914895, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0032779593192606743 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.19469605892775696, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003266640672239043 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17671769275213156, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002734700050326904 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.382425295806561, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.185437417569327 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..158f496759455052fe55d3a175f34734afc3c4db --- 
/dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.24352976537606963, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004350047350541235 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.23908129808521703, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004002524315183232 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.22778807117999614, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0036243774679184364 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.05729292572552575, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002617653205209828 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.05451017420559761, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002306325013293872 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0527365635483454, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002240194553760494 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.18538163733792024, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003600645888264381 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.18116114634018265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032128881913064335 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.17283311820835626, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0029786990259966836 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.18664238451530393, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0036151543248168095 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.18367415808096732, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003324929336987573 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1743815051388937, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.003003683858669049 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.8525428900629293, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1568990466406635 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 
3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b8dbfe332891d765b81767e1662bf08e465f8970 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.06761052931830998, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004168524274964794 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.05943339586169317, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0036304617540668326 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.059542504834570575, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0035683053141444022 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.01636682763621811, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016719863572741598 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.014189933840370804, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014657982637994635 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.014234207773434364, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014260690361126657 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.052604784133526844, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003336410205451067 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.045692475599885846, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0028314510568171344 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04594672257630698, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002810904537853349 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.05281145831744821, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0033495714563258135 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.046242631863418045, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002916590491225937 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04622428003631112, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0028346596378458684 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.15163352455062296, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0413547235971038 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0efd886b57c1bd8fb51598ae66447f081e4ac7d1 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + 
"bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..70d4468adb88c57a8211dabd6ca8531b88dc2ebc --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.14340787798544366, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018032543453857495 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.348079749408104, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0041382877901599245 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.20057647309126575, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024023673762588573 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.02767775271537802, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009721356559807497 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06937497981951246, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002453319853871487 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.039066544296982385, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013626441060990537 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.10053424559477453, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001277020654050582 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.24542491643370598, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030533732704981437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.1407817781383674, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017158350976538461 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11420554133997407, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014870409134011945 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.27886116865239546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 
0.0035333760799287008 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.16000303452385659, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020035707234822052 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.5345722016857484, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05327551375382703 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a448e8c11adfd1203ad5db7bbdb24f57f6d073a3 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.19865216741377603, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003516512262635199 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.29135534859568224, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004246772523997126 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.21705829457281703, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002959231545359903 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.04118386653309974, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0019283565176599058 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06151103352010455, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024532430125831714 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04491172266106288, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0018380470883612547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.14735138093875785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0028518722500306447 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2146519342508703, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032903532550251045 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.16019371252800158, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00236853277069081 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 
0.15275410364710615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00284109895626164 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.22737937708782943, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003605645003436984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.16759564345626984, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024308191123714536 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.9172311763738956, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.16322447855789024 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d4424cb26a07dde500db6fcf8d341e84a19bec0c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.2465315700047466, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004175970619992801 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.2744895062916164, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004132633230322607 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.24156257829126068, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0034116082967927393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.060132830131985185, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002584710303057828 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06493647553156313, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025380809952402056 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.0580916953307158, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002306895150723321 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.18689848183823216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0034836019281092594 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.20621460918831602, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 
0.00329546339611092 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.18229586646296705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002854614939760979 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.1907928929206747, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0034476831058094398 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.21471338674155627, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035611110348475897 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.18747549293928492, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0028727564091906572 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 2.704613633746308, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13932816628204614 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a9ba0e48f0b7b8fa95e0044101298f8790b0735c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.24488137416704311, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004439983550252802 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.2547015400524934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0043629817539744134 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.23419955703690729, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0037544069015259768 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.06045943164183697, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002503136670039761 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06245888691656072, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002433600168461948 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.0575651168139506, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002238805869014741 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 
0.18511602065268223, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003675578589755922 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.19090721673096625, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034376644701036387 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.17601063162717448, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0030542550306405063 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.18745473520934786, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0036729872607959803 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.19536859046211874, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003598817184826712 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.17894501465520485, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.003087597155662158 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 3.089437062529702, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1827268409396636 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7a027237c158a88f9e271a2ed857c69d60e77f5d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.07353706957375673, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004426917583408668 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.06276117005276965, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00387156032277117 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.06132087500847303, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0035658659998728734 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.018229014909705746, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0019191437419252921 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.015796133646357152, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 
0.0014959398218503275 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.015003585358323187, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013899419336256885 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.05725219630577079, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00361985637625251 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.047238026145296305, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002971356877752647 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.04648951924651044, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0027574847885540104 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.057917410235608965, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0036412515752914593 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.04864579563916333, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0031056015721909335 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.04735855139435085, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002806718999058402 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 0.19070979838812588, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06378186049279796 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3faf5fcfad02384df4f5825d5ddbe8d87eea4573 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_DOC_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.002549776342784112, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007166030165886934 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.0020413905595106064, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005594890729416889 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.0022287175815817136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006161246419501403 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + 
"rouge2_precision": 0.00038085758119054455, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00014461738794117318 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0002657839755509012, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00010178048880223215 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.0003119009542132786, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00011884895101753975 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.001898994303267333, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005358267380604738 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.0015046723091027433, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00040639938290870474 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.0016459691843579136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00045147423875777586 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.0020030453576513505, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.000560832569927041 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.001594104620308841, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004280320767214678 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.0017415567558494514, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00047483426648456086 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 9.871985357472749e-40, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 9.721216135694307e-35 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5d81624eb433e42e58ea7245fc0005beffffe18c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.15035747550854134, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018537172978333066 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + 
"rouge1_recall": 0.35742314169547235, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004326079160564529 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.2089918826583484, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024707731495359864 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.0349468617386321, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011091670418172431 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.08661816478419919, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027998243524170996 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04915426636292421, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015546014590166785 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.11031701721659484, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013825633108291136 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.26338148416281726, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033183960564176712 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.1534641057893518, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018542646706649216 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.1191235584341918, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015439232638346807 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.28484018718916787, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003731890576938668 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.16584195681107322, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002084501252740023 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.9294630165344384, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05199953236184783 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 
index 0000000000000000000000000000000000000000..6c2200c219301aa329c87fe3e230aa6c0dab9919 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.20762450013236594, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0039461291351093625 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.24413763817790524, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004174823772437886 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.20791172933370228, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00329315292456849 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.043001413856892226, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002203182319482167 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.05070781231089209, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002301401448382558 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04260592446633687, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0019713326310437816 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.1562927305009076, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003049742520195418 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.18293962617097087, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031269642052189203 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.15602073715303139, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002503729351832688 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.1593551971386879, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0030732593775087977 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.18915937658696833, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0033947710998175912 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.15996751955267816, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0025845243156891263 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 2.0042439168055477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13033809653477546 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..540de19754f6d7c6de57fafbaaef64d0a4b768df --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.2359424064551194, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004094026469321015 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.24137156152029446, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003746084943800122 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.22509114147071038, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003359349736460282 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.05084069401291977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002375545332584951 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.05062417462546563, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0022092158246353067 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.047784340461202085, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0021244464358628947 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.17830347583335351, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003301027358370855 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.1820807332453224, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002954788303945269 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.16971181352967674, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002694937124690388 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.17986687172318414, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0032949814705597958 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.1852104189778644, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 
0.003082867447381655 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1716985031231568, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002716095074624316 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 2.3963330281306017, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10229956742443976 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a527457311750ba3300ab82d07df391fa492cafa --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.23537962078639738, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004425637093001513 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.22790233981678285, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003956303010732718 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.2195806592889226, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003671944538669731 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.053206008686741926, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0025279299018083337 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.04971780973026118, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0021622161152509266 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04843631029051122, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00210059083909135 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.17726440409046024, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003611471694185854 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.17124218344005204, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031466317030328515 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.16496863705556303, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0029641595071464256 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.17842403243803612, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003622742304784544 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.17318872190675308, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0032217666739748746 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1663330953913527, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002986175252170523 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 2.6146189500481682, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1536987337730129 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..58ee076f5dda61ecf4988d1c399d0d2845b61fbe --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.06325338928249166, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0040370124259995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.051302455688280554, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0032918601370581302 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.05273405652690356, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0032801143050456877 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.015839424272271533, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001799340153660356 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.011885180583528233, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0012328784003733852 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.012504879614731455, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001294899307586975 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + 
"rougeL_precision": 0.050331670340206996, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003309508915566947 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.04029477500488443, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002631169598444604 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.04150375469427366, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002629344620451854 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.050554573055058456, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003318947794768516 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.04070306392026555, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0026745492387180727 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.04177261698814985, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002646732117471144 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.07355919987680871, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.03017235561587739 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e5eab18965627f7eae6884c9cd4b93617db2734f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0b5db523f4fb495af130ceb2805afca00354a201 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 6.06565649817244, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.26580129636863487 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.08236949927702025, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0024104850431885413 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.719480590736069, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006398515591482853 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.1346828076880258, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 
0.003005110263614226 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.0662606810581365, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002335185253764046 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5581501081874023, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007956342433328234 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.10761118468871383, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0030242837703836024 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.08166459222277754, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002389577144028269 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7162201035906264, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006438467461215888 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.13376588963623146, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0029942137429941505 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.07995628492332586, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002395303523446372 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6988153667130336, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00667405770086968 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.1306994980839404, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003003875535784151 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..72aaa33320a77066053ffb6448f6cbc117268e42 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 63.26037952408198, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.951804882148515 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.6988575984790449, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006588116693745785 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + 
"rouge1_recall": 0.6740866889600351, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.007010681242613929 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6727480193548931, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0068474612270140334 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5527918786636355, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007922884516561693 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5405879983491555, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008055493652125685 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5392222989862876, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00794919143323893 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.6841121597165629, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0067911100878414125 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6624258059575598, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0072087579910821735 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6606953549883641, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007053863769326496 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.6873985752009085, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0067517698213240395 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6643687405464695, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0071647281967591485 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6628897552319865, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.007007884050491445 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a2ad36e2a3ad226c9a727e4a145d30f78bf6ed5a --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 65.7787599516413, + "dataset_path": "piqa", + "dataset_name": null, + 
"subset": null, + "bleu_stderr": 0.8805278393831618 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7160937126971031, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006316259371960246 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6983512547114311, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0066631774166225404 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6960318899051244, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0065150837518839376 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5735137941726731, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007734486272816341 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5639314795514925, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007873750511765216 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5618897276378911, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00777775543365932 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7024300295876501, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006522310714591612 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6874458530825792, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0068708930660312095 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6846881504443719, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006731291659364605 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.70510042853508, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00648084314614621 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6892365545992871, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006830029581115679 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6866058410510959, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006688511386355791 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..c26acc381a37643bc67aedfc403228587e64a52f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 68.79659105137841, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.8911607307587284 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7281768854202323, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006091328345208682 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7187148914356442, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0063093252000116454 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.7139591788200526, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006222444811463132 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5895951227185078, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007589501024665014 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5841983766424242, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007690796430733763 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.580739926062922, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0076152906715177895 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7152369828707047, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006320392771571204 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7077083520073133, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006545042418523855 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.7027473285080598, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0064598423376290865 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7175001091151413, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006279054505869695 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7093908666073235, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006503238973466924 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.7044741566673249, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0064178648210677725 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..20b5b387a2b26c1e29208364b1eef2c942415c19 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 68.89024334929532, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.0053364628892096 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7283756240423309, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006093561817243837 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7204913392251014, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006274653215078908 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.7153440029217795, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006190308119095787 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5902361563890636, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0075450855599918385 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.58542613380595, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0076389718894045245 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.581967194437133, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007569458361483834 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7155444610558683, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006310523362454862 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7093415097622576, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006503769590635487 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.7041497846834553, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006425679507241933 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7174324105961624, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0062787061597621655 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7109083354996658, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00646856437751123 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.7057056561042719, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006390567486985987 
+ } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0ad5baccda4beae63edb99cbf6b1915f005a24d7 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_Correct-the-solution_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 70.06736987950548, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.9767815709778029 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.729594849228905, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0060710557444570695 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.726939841836775, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0061673276257683965 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.7195026463671974, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006122247411756088 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5934420624153478, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007521177536614038 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5917838640682289, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007591836716456811 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5869778194049874, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007530370937645896 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7181192568693671, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006271435942895118 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7166816998141146, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006389531553039699 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.7092419533890473, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006342085468956855 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7196391201464266, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006243283678540828 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7181584658507878, + 
"dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006355241979175212 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.7106557360440093, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0063101236805124935 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7fd030818ecb5199a853d1628a97a17063e107c0 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..67d746c7b6812a7c9b5a4158819b6feb35c6e51b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5027203482045702, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665651503000727 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5027203482045702, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665651503000727 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 
1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cb0d842ac72202fba24d0a500c8b0c167923aa2b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.514145810663765, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011661154475524838 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.514145810663765, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011661154475524838 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..976233e39383e28150cc67ea05eb986db0b8c5c8 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5212187159956474, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01165531473228886 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5212187159956474, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01165531473228886 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8ca22fb21f29aba70fe5335416a124e46a2b9710 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5125136017410229, + 
"dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011662170084916896 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5125136017410229, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011662170084916896 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6b4a56f1ffa7b31ed63baf96f7b782a1c945dd59 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5092491838955386, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663828032649181 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5092491838955386, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663828032649181 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..efd002eff0430897f0ddba9dd6f54a586aff6827 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.16038917825535667, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.013645183773152294 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.019717407526919313, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0005241226898875045 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.21991235718003924, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004124788414398084 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.034295347974031186, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00080874557662922 + }, + 
{ + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.003053738189335298, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0001553041432895281 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03824249021088632, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001975265638985592 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.005371789507833525, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00026020055238192285 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.017812403761871688, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0004288523322774551 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.20443665582319562, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0038249367833209564 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.03113183301172919, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0006873859750781993 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.01619119548364551, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00043471160357438384 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.1891909150799779, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003730983463664777 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.028214906108285757, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006610920318795323 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7448626fd3d41e401cce890b7fcdc91f996e1397 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.2751691068568516, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.013136219804520673 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.06885210549637656, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0028535182012972664 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.17842265399629653, + "dataset_path": "piqa", + 
"dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004223503540841405 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.07214164014313752, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0024508893541148966 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.013821947965005339, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011850586486327943 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03344723252560877, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019089636877996823 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.014805868732602081, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0011349585009864634 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.057690189785966156, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0023735730007109603 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.1583254724810553, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0037922699239114157 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.06141675468189888, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020741009028569673 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.058586296430585875, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0024669791370603633 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.15379774689758435, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0037832161320439433 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.061260958770885235, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021205972969045098 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f0e3e56036eeba1809fe53b136849109bcc5a037 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.7478592848030261, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05206294347980578 + }, + { + "task_name": "piqa", + "prompt_name": "no 
prompt needed", + "rouge1_precision": 0.07482884032327494, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003612480132983617 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.07821116773610592, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0033314055394672555 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.060146538447062185, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0026253423803596007 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.016468959770221926, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014638792570753052 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.015742688935978556, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0013085844552320407 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.012735175881541076, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0010739021956960627 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.06379992803442668, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0031121495725695463 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.06819680109978733, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002969336578287296 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.05160822400895001, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00228266908781442 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.06657979060268567, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003264654543949671 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.06934836838932225, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0029951702164645205 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.053315918135572495, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002359636769296838 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..818755f91a7706ff5a069d1e07da0108e060d3b8 --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.45426082881899243, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0606143505719565 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.07290254469938463, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0036333756842878845 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.07008239737410384, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003310749345271666 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.058422577709111474, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0026946026782868468 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.016529422242712644, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016568372778018017 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.01616429994036002, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0015418921009779543 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.012703293081847745, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001197207060641643 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.06262930735049298, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00319556996686759 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.06161665158045485, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0029931784245315683 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.050359394410487104, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0023702453331618492 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.06499144598728437, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0033179552370669625 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.06248884178951301, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003011841506938599 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.05173144445865948, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002423125428506434 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} 
\ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..91024fdef1b7cff02ea6d967a4bbcb6a1df0e20f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.5257099514220486, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.054470098489529585 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.08433502515888319, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0039111129507774746 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.07755475127098414, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0034502441379280794 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.0674817786636535, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0029464283746983093 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.020612038094728467, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019179044390943054 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.019342108926184154, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00173678799008175 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.01642293331569968, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014739545404562904 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.07275740895813725, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003451903236764541 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.0683379924797514, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0031386155388587558 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.058732697860771205, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002646209414844525 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.07589809456013663, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003610021093780004 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.06966112094912398, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0031716009555262064 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.06036538574416117, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0026965674415596326 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f2ac56e646e5a79c990a890087c9004cfc534bf8 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_no-prompt-needed_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.6100817469863049, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07611933506009459 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.09731785854243621, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004118827876530802 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.0909234801856218, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0036289176500432993 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.07895885565036868, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0030819472178692767 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.02311349812806924, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001974642909283926 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.020134835356896908, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0017387345774011085 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.017754410094936536, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014564164985431847 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.08330336586514235, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003613644785911595 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.07943627989992699, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0032943078048450745 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.06801592952723212, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002740700561112419 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.08686183044375809, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0037620960553600316 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.0817329947639033, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 
0.00334337780108826 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.07057923291607501, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002814634535587915 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..60d4bd6d2895459662a28f72ec2c760d1bbeba2b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4961915125136017, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665485744746797 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4961915125136017, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665485744746797 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8827837778d5881dafd05de2bd776c5d87902f51 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5038084874863983, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665485744746797 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5038084874863983, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665485744746797 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6912e99e5a7e80589804229c0552fb15d382f975 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5087051142546246, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664055982032837 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5087051142546246, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664055982032837 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5fb21f1ea30a577955dd0e0de9296a99189e581b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5097932535364527, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663586263283223 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5097932535364527, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663586263283223 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4edcbcb293c5260e1421bc9492e1130819cc89a2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5108813928182807, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663061261117732 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5108813928182807, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 
0.011663061261117732 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..843a7c6b6dfe3e5cb2fd69b8a8dbc09e1fc858ba --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_pick_correct_choice_index_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4912948857453754, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664055982032843 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4912948857453754, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664055982032843 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..47141b0239a8e19eec12f8f1f0f5967612eff93c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5658324265505985, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011564264866016057 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.573993471164309, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01153737544851943 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9cbbab86bcafc951e8259c82d0e3012558634534 
--- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5636561479869423, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011570895640553714 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5674646354733406, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011559142916063145 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8d10406b10cc68c0d4c619b9b7ff971806fa7c31 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5554951033732318, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011593746871584154 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.559847660500544, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011581954727227395 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f927cf3e663a62e3ffc35cc751d94f37804f1af3 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5484221980413493, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01161098935881427 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5527747551686616, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011600659443292917 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7bad00bf22a85a32f20fedcc348e8269a320b0cc --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5511425462459195, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011604638382510184 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5554951033732318, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011593746871584154 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..82184144fb8058fca95407f95d7621742980167c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.545157780195865, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011618148261187405 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.544069640914037, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011620422647622242 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..84809287ef4b686ca3d92df2801521bdd3c16678 --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.574, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01564508768811381 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.512, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015814743314581818 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d7d52b6bd247078da797606a7c4eec2e6771206e --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.652, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01507060460376841 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.632, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.0152580735615218 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ba4af5e8c8efdf3f675e69222752ad51b4f167f2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.656, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015029633724408947 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.646, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015129868238451772 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5eddd11334df961e9088fdb063f6aba9735b7ae4 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.654, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015050266127564443 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.653, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015060472031706618 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7370fbd70640c493c2793e90a50b652d69abf06c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.663, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014955087918653603 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.662, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014965960710224482 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2fac66fa9f19c4212ef17ce8147301caf3499e1d --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.659, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014998131348402707 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.674, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014830507204541038 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5aa348d9604fe17c2a554fc6b9dc34af3b16ba36 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.854, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011171786285496497 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.767, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.013374972519220051 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5194091a319fd2529e2ea28ddb1579cabc729483 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.889, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.009938701010583726 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.885, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.010093407594904603 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1875b19dafbb38303dca1885adc01f065d496aa7 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.908, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.009144376393151086 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.895, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009698921026024944 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fb00c22fab327575e3016b251c4f340f0d91c584 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.91, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.00905439020486644 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.904, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009320454434783246 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f24ddbdc7e951f6ead25feab840563ab818af86e --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.918, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008680515615523722 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.908, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009144376393151084 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..375bb11d2eb0c0d1fd4131a8e2ac1400cd3f2206 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Direct-Question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.921, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008534156773333457 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.919, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.00863212103213997 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..79f4651fc2a6e800d05fd767e67f61dbcc3ed421 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.311, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014645596385722688 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.35, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015090650341444233 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9065c5763869d211e76620398aebec07585248b0 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + 
"prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.349, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015080663991563097 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.354, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015129868238451773 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..19a84037a14c7921ca891ccd66832281d81be3f3 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.335, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014933117490932577 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.35, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015090650341444233 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2ad8777e47dcae307c02d7c744484abf1db983e4 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.349, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015080663991563102 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.36, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015186527932040126 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9209d1b6cda78882d7f811597a0412c25a12f6ff --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.347, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01506047203170662 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.341, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014998131348402718 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a2638beb9723b70f871df02b8f27f162c3aa6096 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.331, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01488827258820394 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.339, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014976758771620342 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e5f84c760a9db4b46396ab409930dbc6ab91358e --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.285, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01428212095520049 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.303, + "dataset_path": "sciq", + 
"dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014539683710535246 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5d1d5ec8dc9f2572f9df99738f3be5a959de0376 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.287, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014312087053809963 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.288, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01432694179723156 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..827666fb517680aa9dfd5adb23a185aaee072bf2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.289, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01434171135829618 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.295, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014428554438445517 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..447bb6e97dd77fdf0e03cbde92fc592f16aac907 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.297, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014456832294801105 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.305, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01456664639466439 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..90aa39644113e59967de50c1bffb655b6804088c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.292, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014385511563477336 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.296, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014442734941575018 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3dbff3e99dfd63bf9a5cb7adc37c044a5d518438 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.299, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014484778521220468 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.302, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014526080235459541 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..313bf68bc1f7a5fe6f7f114bb4fbd0e706ff94fe --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.292, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01438551156347735 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.299, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014484778521220466 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..efa8411610ac587a71cd181692c157d07b2b618d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.318, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014734079309311903 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.326, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014830507204541037 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9e9523902762973dfee1c133818fde14a18c958a --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.289, + "dataset_path": "sciq", + "dataset_name": null, + "subset": 
null, + "acc_stderr": 0.014341711358296184 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.298, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014470846741134715 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2b3a18a75d383b7f136daa10e1e0de69d0548a80 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.317, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01472167543888022 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.326, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014830507204541033 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ca8bb36baa9592f992981a454bd834dded0b8149 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.313, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014671272822977892 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.344, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015029633724408947 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..a320852dd084b5e5b2359b506338c57e18636877 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_sciq_Multiple-Choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.325, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014818724459095524 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.321, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014770821817934645 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..be290031380c595f7912a8c7b63aaa06896c6d72 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.5045430251202565, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011561954965856519 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.5056119722073757, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561703928784327 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e9e29b751faf5f6397bae5e12e0d97a2b1c01149 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4580438268305719, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011521653168224733 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4756814537680385, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011548748301487319 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6ad4acce500ff00d889fb6748ba383fb8ae9fe89 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4649919828968466, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01153405649450586 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.46392303580972744, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01153229486915312 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..da21193f82627a49aa95cdaed5397cc3550ba225 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.46178514163548906, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011528611805439893 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4596472474612507, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011524715486240653 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..1783330f8d11bfdef9172efc93b96999fbdfd7a2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4660609299839658, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011535764881641411 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4580438268305719, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011521653168224734 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..18ac625fc4766fbf9fb09e8d6599adc71650b029 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.46285408872260825, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011530479981182628 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.46178514163548906, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011528611805439891 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..795bca047fc2d92b5ed09e064981047116f8372f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.49706039551042225, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011562232421541939 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5146980224478889, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011557435464292921 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..012ce18da6ba7b96cabda94b44ff07de6aa18c87 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.46980224478888294, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011541325320336616 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4949225013361839, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561836054238772 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0e4ccacfea57fa1378eef8695ef2aeef48d72b19 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4548369855692143, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011515167912227987 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4660609299839658, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011535764881641411 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..8c59cdd2ef468340cd069b1876fda3293858ef58 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4580438268305719, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011521653168224729 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4607161945483699, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011526690316014587 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..941e2ecbf8659033cf7a4f6d3fc885514cf3b471 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4596472474612507, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011524715486240648 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.46125066809192944, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011527657726586461 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4bace0d77477a953cf088bdd0f29c7e4e8cc5b7c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.464457509353287, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011533182338113984 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4548369855692143, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011515167912227987 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9960e7372164edbbddd9f970b5ca2f6af9827369 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a3594fe24f6d84298c28347deb8bae21f878e433 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ad6e786e06d6fc5122fe34a17e9f8cc89cfadff9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4916408305256a532afdc9c55492b33001743d8f 
--- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6ecddfcfeb6be1e8d5a3b1c6ccb8a7f032a3da8e --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ce4c3819224db0e10c7b665aa02d91b85d3b7719 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ae5f121706d6fa83304b92001905b8845806e187 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.5050774986638161, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01156183605423878 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.51309460181721, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01155846638336718 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..767b9d84be5cb91d2741a9b532d3a0b7c24ee08a --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4660609299839658, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011535764881641411 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4804917156600748, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011553628196999307 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4eebd61c539a234d6424bfb26c92e9ee7cd3e54c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4649919828968466, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01153405649450586 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.46392303580972744, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01153229486915312 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..37ee69b706746caf6abf9a927c7f51e024ae92c5 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4665954035275254, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011536599118298178 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4575093532870123, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011520605695184077 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c4de1e5f383874ddb823947c5d1acfbcd1f0efad --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4548369855692143, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011515167912227987 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4569748797434527, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01151954486592806 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8cf9de3f8a272ae78f5d4cad674fc844c14c138f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4575093532870123, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01152060569518408 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.4649919828968466, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011534056494505868 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6a34cb5850e8263fdcb832ef335c3bd69ff48d8b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.5034740780331374, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011562153149168303 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5189738107963656, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.0115541041740197 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c62ef1c02349337ff18b184f1035b310a0a595bd --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4660609299839658, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011535764881641411 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.49545697487974344, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561954965856516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_2.json 
b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cf418f486628b74a67b2d086ef571c6aef946948 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4489577765900588, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011502027057558893 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4649919828968466, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011534056494505868 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..53dc6fcbc5f2bdc7ef6880d14d054a18187562dc --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.46178514163548906, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011528611805439891 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4559059326563335, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011517383123961536 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a36b020dfa31c24a8c67871279e8bad696eb56a6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.46178514163548906, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011528611805439891 + }, + { + "task_name": 
"story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.46392303580972744, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011532294869153118 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ea1123006b34674019043f801364c50c93709aa4 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.47140566541956175, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011543509045585206 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4607161945483699, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011526690316014583 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..81917713486b6b9e55b8e16289d2832ae0ea955e --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5667870036101083, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.02982676408213827 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.47653429602888087, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03006330041190266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end 
of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..46d15cb5f9f8a603dc7cbdbc710e824a72e62834 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030063300411902652 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..89932803dc887621ad10f3482b4387d70a144c95 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03003973059219781 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366422 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..accfaadbb4a8709a414125776507b43a71ee20ec --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029992535385373314 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030039730592197812 + } + ], + 
"config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b628db69d7cdee72070d13a4ac2919181c0194b2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5523465703971119, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029931070362939526 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143713 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c1670965ae71738c16aa6330f9ae0516097e721b --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5df9559eca3dca4531bd0db7e7115968dd309775 --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c2ad898352bb6470e59f2c5fe70eb1bee713546e --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..82cd88b76f74652c9a254ea901b4c08fce2f2cce --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..669c06c80d3f882a4262558a8b051cdb6b32e63f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030009848912529113 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03003973059219781 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a490aaf88f4364562fb1a8df02db40fa436e95c9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5451263537906137, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029973636495415252 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5487364620938628, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029953149241808946 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a87800b832fc9668005282a10f7b68ad64a0030e --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03006330041190266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ec3318c26b883980fc9e9bd3fe0214af12dd3067 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030025579819366422 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..07372f9ff7f97dc438d56be91589feef77a035d9 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976633 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..792239233abc601ed64d352928a4e3cc7d8a1449 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fa4294a888a39d3682da6f5708189cc8fa833bb1 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5523465703971119, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.02993107036293953 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5523465703971119, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.02993107036293953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a6f4140a970c20a55efb9c49309ebc4dbdd075e0 --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030009848912529117 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030072723167317184 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9c85add76686a8dedc818d5495954961592865cb --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5451263537906137, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029973636495415255 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.555956678700361, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029907396333795997 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..67f1bbabb5ee0e3c47b8e00c87297966421ca4c7 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..336ad091092635608db5dabecbf4a89f0794ef33 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.48736462093862815, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4465388b91f9c166bc5217397e01fa8b9a7d3d5d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0ef401fad30bf7562c4c3de70c26123cb475c7f5 --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030052303463143706 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03003973059219781 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6303edabb9374a08c6fa6caf2fef6c08dd55f9fc --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5487364620938628, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029953149241808946 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029992535385373314 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4bb20657c43c622430c5d8e45e3b3c3fd390f101 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_guaranteed-true_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5415162454873647, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029992535385373314 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.555956678700361, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029907396333795997 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..841f6d3f45f244e9412659d884d9c3e4eaf757d8 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03003973059219781 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..45dcfd54cc25f7877c946db0ea774e25542834fc --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a1e27fabae38eb14f8e17306ad26d8e4c0fc2f89 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_2.json @@ 
-0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9e7fa99250f0a2132ccc56ff20edf7ee361a6569 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5379061371841155, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030009848912529117 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cc86afb08eca59ee53feb200bef3882f027a4cc0 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5487364620938628, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.029953149241808943 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.555956678700361, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.029907396333795994 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 
4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fc684dad67e6fab9680ebfabcda4e2cab0190a6d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_superglue_rte_should-assume_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5523465703971119, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.02993107036293953 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5523465703971119, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.02993107036293953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0e3af3c2ebaeb55b5fd967a086229f456e026102 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.4940805051302289, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.49171270718232046, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050555322824192 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dcc52b0892afb35258df888dfc95d4a11ebc9c9f --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5027624309392266, + 
"dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052271211616438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..035c21c03ba61d2e5691089275c481926a465669 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.49329123914759276, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051220692330346 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225632 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f964d97f817f09663790208f3bd7d3acaa50be61 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915867 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330352 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..39bf0c9942b790242d63f1a8c99a6214fe32fc9a --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5153906866614049, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045826789783654 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5153906866614049, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014045826789783656 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..56e59898a28eb127dd6c31949e7eaff1b312ec11 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_Replace_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5217048145224941, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014039239216484624 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049294536290396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aed37a6748c25782ae39a9a256d1aa40694a6c3e --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076896 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.489344909234412, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.0140492945362904 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3f1b795e24c8e006e3398fb7400aa07ef1e5ebab --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4877663772691397, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014048278820405616 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4846093133385951, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014045826789783663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..15cab40a7a0dc5dba281276d2c574d81b8b32956 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076906 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049512 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..91735450d4e1e4b89387bb2af52f291fee469c83 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_3.json @@ 
-0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048278820405621 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..35394fff522cbc1dea412b6a15acd9787497fb7d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052446290529019 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bc20de6599824291e7b13cd4ee814a4c70b69a96 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_True-or-False_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616436 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5169692186266772, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014044390401612976 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5c9539938be62ff21f20d709eb4b16c214a088c1 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4909234411996843, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140501700944977 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.494869771112865, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051745961790516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..770d1a138f1d7b0287222cf6ab84dc8fea16ddff --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5130228887134964, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01404771839399767 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076911 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..92b4171d56d3742f8f8042d9b031bf9aa8924be6 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 
0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405090552122858 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049512 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9922d64b8ac4b3f0c7095c967ab921ee36731fbc --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5153906866614049, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045826789783656 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b756cecfff1e3413e1097a2907240c5b09429b33 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529022 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2d84dcb51b5f6668999b5e6e5c4f2d23f6b556ae --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4988161010260458, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052446290529015 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..db615ecf5fb3e70843165b375c53ceb8334007ba --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5217048145224941, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014039239216484622 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367589 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7bf5a893148d857fbd96ccff240c57f16cd0d405 --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.516179952644041, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045126130978594 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050905521228577 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..10bb33ddf3aacba98193bd1c51b571419b0c17a1 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5090765588003157, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140501700944977 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049512 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..475408f39ff0727a2f18482fac0195e140c530e4 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5256511444356748, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014033980956108557 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5224940805051302, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01403825782405988 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8f05de14cfc06245b3ba26892ca00b5124ab971c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01404827882040562 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5185477505919495, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014042813708888378 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9cde9fc3795e35829a7b8beb43983c2ba431c316 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_stand-for_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014048278820405621 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_0.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6b1c700a9981a7d7c600105b2cbee93a43e4fa70 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405237625922564 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.489344909234412, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.0140492945362904 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_1.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..93a6d0d0b392074ed36a841bb7313290422b3e7d --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.505130228887135, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051745961790513 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529024 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_2.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..17973936376312a9b00488ca0d70bd64eb4c3158 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616438 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01404827882040562 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_3.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b848faf26115e0b3ed8236bd3578daf3046390e2 --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076892 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051500838485807 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_4.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..424cb8a09ab348bac8ae076592656f03b3db762c --- /dev/null +++ b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076906 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405090552122858 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_5.json b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8e3864333328112955d167594387bf8e5034cd69 --- /dev/null +++ 
b/4b284b28boscar/eval/slim.4b284b28boscar_winogrande_underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052446290529019 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5090765588003157, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050170094497697 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/merged.csv b/4b284b28boscar/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..9bed45cf57d4752fdd508100357e671747f419a4 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.21912991483763275 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.21912991483763275 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2450476239671336 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2450476239671336 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2506081474778741 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2506081474778741 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2523951605760761 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2523951605760761 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.25480077699734965 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.25480077699734965 +e2e_nlg_cleaned,5,average,multiple,0.20366360397601102 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04748504386381342 +gem_xsum,0,median,rouge2_fmeasure,0.04748504386381342 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.0455477081115806 +gem_xsum,1,median,rouge2_fmeasure,0.0455477081115806 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05025303080761008 +gem_xsum,2,median,rouge2_fmeasure,0.05025303080761008 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.0527365635483454 +gem_xsum,3,median,rouge2_fmeasure,0.0527365635483454 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.014234207773434364 +gem_xsum,4,median,rouge2_fmeasure,0.014234207773434364 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0 +gem_xsum,5,median,rouge2_fmeasure,0.0 +gem_xsum,5,average,multiple,0.035042759017463974 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.044864078248862495 +web_nlg_en,0,median,rouge2_fmeasure,0.044864078248862495 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.07803113267822599 +web_nlg_en,1,median,rouge2_fmeasure,0.07803113267822599 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.10282617536971313 +web_nlg_en,2,median,rouge2_fmeasure,0.10282617536971313 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.11040571352169745 +web_nlg_en,3,median,rouge2_fmeasure,0.11040571352169745 
+web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.11494130887956767 +web_nlg_en,4,median,rouge2_fmeasure,0.11494130887956767 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.12304296607701624 +web_nlg_en,5,median,rouge2_fmeasure,0.12304296607701624 +web_nlg_en,5,average,multiple,0.0956852291291805 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03380900442536606 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03380900442536606 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05447927752503051 +wiki_lingua_en,1,median,rouge2_fmeasure,0.05447927752503051 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.0696826972387138 +wiki_lingua_en,2,median,rouge2_fmeasure,0.0696826972387138 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05970860054565866 +wiki_lingua_en,3,median,rouge2_fmeasure,0.05970860054565866 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01919932938843845 +wiki_lingua_en,4,median,rouge2_fmeasure,0.01919932938843845 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0032474649318922875 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0032474649318922875 +wiki_lingua_en,5,average,multiple,0.04002106234251663 diff --git a/4b284b28boscar/evaluation/generation/merged.json b/4b284b28boscar/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ea5f8567794090ea36422ae2e661dad648bdc6 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3236606671689831, "bleu_stderr": 0.03434728960928616, "rouge1_fmeasure": 0.09843971445320344, "rouge1_fmeasure_stderr": 0.0020285748309311217, "rouge1_precision": 0.07548519902353701, "rouge1_precision_stderr": 0.0027103393867684133, "rouge1_recall": 0.27434436630785664, "rouge1_recall_stderr": 0.005190735227370315, "rouge2_fmeasure": 0.044864078248862495, "rouge2_fmeasure_stderr": 0.0012327331599678007, "rouge2_precision": 0.03280247893554901, "rouge2_precision_stderr": 0.0015645885841163746, "rouge2_recall": 0.1283696309182241, "rouge2_recall_stderr": 0.0032411037301123703, "rougeL_fmeasure": 0.09452977794835304, "rougeL_fmeasure_stderr": 0.0019058829105986163, "rougeL_precision": 0.07235248106657086, "rougeL_precision_stderr": 0.002585891559293976, "rougeL_recall": 0.26594645785931426, "rougeL_recall_stderr": 0.005067281777094968, "rougeLsum_fmeasure": 0.09372697520153284, "rougeLsum_fmeasure_stderr": 0.0019257190223106488, "rougeLsum_precision": 0.07204958395594553, "rougeLsum_precision_stderr": 0.00260168307386312, "rougeLsum_recall": 0.2598611687840442, "rougeLsum_recall_stderr": 0.004868835623139477}}, "1": {"PALM_prompt": {"bleu": 0.5303952344866372, "bleu_stderr": 0.03866628930999967, "rouge1_fmeasure": 0.15526776951779941, "rouge1_fmeasure_stderr": 0.0037112903723659145, "rouge1_precision": 0.13500481273386208, "rouge1_precision_stderr": 0.004456041975528417, "rouge1_recall": 0.3050209335340558, "rouge1_recall_stderr": 0.005019779051030469, "rouge2_fmeasure": 0.07803113267822599, "rouge2_fmeasure_stderr": 0.0024911371039099254, "rouge2_precision": 0.06800836209907518, "rouge2_precision_stderr": 0.0029183003453498208, "rouge2_recall": 0.15616590508602413, "rouge2_recall_stderr": 0.0035788033957169676, "rougeL_fmeasure": 0.1409133691299396, "rougeL_fmeasure_stderr": 0.003170026813140542, "rougeL_precision": 0.12100254673151754, "rougeL_precision_stderr": 0.0038758351195453834, "rougeL_recall": 0.28545106347976323, "rougeL_recall_stderr": 0.004617233759164798, "rougeLsum_fmeasure": 0.14369743370526267, "rougeLsum_fmeasure_stderr": 0.003262924473269048, 
"rougeLsum_precision": 0.12390505105203532, "rougeLsum_precision_stderr": 0.003973176754882277, "rougeLsum_recall": 0.28858761694209856, "rougeLsum_recall_stderr": 0.004669345842896859}}, "2": {"PALM_prompt": {"bleu": 0.6963169812480917, "bleu_stderr": 0.025883064453583102, "rouge1_fmeasure": 0.19513258665465127, "rouge1_fmeasure_stderr": 0.0043495841141288575, "rouge1_precision": 0.1792256974533611, "rouge1_precision_stderr": 0.005365719346439527, "rouge1_recall": 0.34488184545131717, "rouge1_recall_stderr": 0.004970214155406786, "rouge2_fmeasure": 0.10282617536971313, "rouge2_fmeasure_stderr": 0.002972081492632468, "rouge2_precision": 0.0957315113405197, "rouge2_precision_stderr": 0.0035283155722927633, "rouge2_recall": 0.1834475236414602, "rouge2_recall_stderr": 0.0037698051706010482, "rougeL_fmeasure": 0.1746470290433306, "rougeL_fmeasure_stderr": 0.003733649528603583, "rougeL_precision": 0.15875527177897067, "rougeL_precision_stderr": 0.004689203205430283, "rougeL_recall": 0.31884399599505703, "rougeL_recall_stderr": 0.004581869680066843, "rougeLsum_fmeasure": 0.17894683258352004, "rougeLsum_fmeasure_stderr": 0.003839954827867676, "rougeLsum_precision": 0.16348748083622414, "rougeLsum_precision_stderr": 0.004839999383468051, "rougeLsum_recall": 0.3238241924312652, "rougeLsum_recall_stderr": 0.0046243997645253}}, "3": {"PALM_prompt": {"bleu": 0.8274667802488902, "bleu_stderr": 0.042366964185806814, "rouge1_fmeasure": 0.20403726404125458, "rouge1_fmeasure_stderr": 0.004566219372488845, "rouge1_precision": 0.18955004198888442, "rouge1_precision_stderr": 0.005662094291901046, "rouge1_recall": 0.3533218112336572, "rouge1_recall_stderr": 0.004960841139206212, "rouge2_fmeasure": 0.11040571352169745, "rouge2_fmeasure_stderr": 0.0032753908389439508, "rouge2_precision": 0.10477428578899617, "rouge2_precision_stderr": 0.003935236903263568, "rouge2_recall": 0.1901585058979038, "rouge2_recall_stderr": 0.0038815093874070626, "rougeL_fmeasure": 0.18223468887125507, "rougeL_fmeasure_stderr": 0.003924418368935964, "rougeL_precision": 0.16706159999567868, "rougeL_precision_stderr": 0.004913635515279735, "rougeL_recall": 0.32629206613585876, "rougeL_recall_stderr": 0.004527254280455867, "rougeLsum_fmeasure": 0.18670844563664446, "rougeLsum_fmeasure_stderr": 0.0040383704320243125, "rougeLsum_precision": 0.17216966052247287, "rougeLsum_precision_stderr": 0.005091094356122184, "rougeLsum_recall": 0.3313273065171706, "rougeLsum_recall_stderr": 0.0045736991609824015}}, "4": {"PALM_prompt": {"bleu": 0.9591911553106375, "bleu_stderr": 0.05993639067207907, "rouge1_fmeasure": 0.2120064135279467, "rouge1_fmeasure_stderr": 0.004550653208173271, "rouge1_precision": 0.1924412284693008, "rouge1_precision_stderr": 0.005450604205447632, "rouge1_recall": 0.3705586639190707, "rouge1_recall_stderr": 0.005109363173845363, "rouge2_fmeasure": 0.11494130887956767, "rouge2_fmeasure_stderr": 0.0032162324276510928, "rouge2_precision": 0.10648010653830348, "rouge2_precision_stderr": 0.00377122887055479, "rouge2_recall": 0.2022148169333304, "rouge2_recall_stderr": 0.003998276489277708, "rougeL_fmeasure": 0.18886918285090654, "rougeL_fmeasure_stderr": 0.0038962288174863224, "rougeL_precision": 0.16950864458355333, "rougeL_precision_stderr": 0.004740831041491813, "rougeL_recall": 0.34083637585044774, "rougeL_recall_stderr": 0.004631727723114902, "rougeLsum_fmeasure": 0.1940229701192449, "rougeLsum_fmeasure_stderr": 0.00401611332459292, "rougeLsum_precision": 0.17511108437715225, "rougeLsum_precision_stderr": 0.004905333361798361, 
"rougeLsum_recall": 0.34672946562667306, "rougeLsum_recall_stderr": 0.00468590709544727}}, "5": {"PALM_prompt": {"bleu": 1.0660500946745477, "bleu_stderr": 0.057054038562455854, "rouge1_fmeasure": 0.22541411583738258, "rouge1_fmeasure_stderr": 0.0046964527830320015, "rouge1_precision": 0.2143907231679788, "rouge1_precision_stderr": 0.00587683477313044, "rouge1_recall": 0.37935769424464766, "rouge1_recall_stderr": 0.005114542762979336, "rouge2_fmeasure": 0.12304296607701624, "rouge2_fmeasure_stderr": 0.0033297218439898735, "rouge2_precision": 0.11999776807861791, "rouge2_precision_stderr": 0.0040878065570314984, "rouge2_recall": 0.20846703611506925, "rouge2_recall_stderr": 0.004052332137679142, "rougeL_fmeasure": 0.19952305726552913, "rougeL_fmeasure_stderr": 0.004002247434870572, "rougeL_precision": 0.1874233346433946, "rougeL_precision_stderr": 0.0050850214855498194, "rougeL_recall": 0.3480461100730932, "rougeL_recall_stderr": 0.004676777395335985, "rougeLsum_fmeasure": 0.2058454997784591, "rougeLsum_fmeasure_stderr": 0.004150385211942694, "rougeLsum_precision": 0.19460128268928695, "rougeLsum_precision_stderr": 0.005298127155329163, "rougeLsum_recall": 0.3552748425353144, "rougeLsum_recall_stderr": 0.004735986385918885}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.36187522199539, "bleu_stderr": 0.13845059941627463, "rouge1_fmeasure": 0.14568887330505176, "rouge1_fmeasure_stderr": 0.0024304012352800524, "rouge1_precision": 0.1325770613482503, "rouge1_precision_stderr": 0.0025192320948419973, "rouge1_recall": 0.20000361049618887, "rouge1_recall_stderr": 0.0033883319122602765, "rouge2_fmeasure": 0.03380900442536606, "rouge2_fmeasure_stderr": 0.0009820401088417524, "rouge2_precision": 0.030157012060673342, "rouge2_precision_stderr": 0.0009243132657339652, "rouge2_recall": 0.04736363206911103, "rouge2_recall_stderr": 0.0014655640703641456, "rougeL_fmeasure": 0.11261046327516765, "rougeL_fmeasure_stderr": 0.0017887243866971374, "rougeL_precision": 0.10196660477211375, "rougeL_precision_stderr": 0.0018957212855749772, "rougeL_recall": 0.15791691922428616, "rougeL_recall_stderr": 0.0026796281489341964, "rougeLsum_fmeasure": 0.13507023756955555, "rougeLsum_fmeasure_stderr": 0.002258433462891094, "rougeLsum_precision": 0.1229739908341885, "rougeLsum_precision_stderr": 0.00235776548625545, "rougeLsum_recall": 0.18595911596519776, "rougeLsum_recall_stderr": 0.003178608298694486}}, "1": {"tldr_en": {"bleu": 3.3670705128175316, "bleu_stderr": 0.11604750071877634, "rouge1_fmeasure": 0.2120115141592031, "rouge1_fmeasure_stderr": 0.002201889759622243, "rouge1_precision": 0.253166878261391, "rouge1_precision_stderr": 0.00348988815268275, "rouge1_recall": 0.24765343018422936, "rouge1_recall_stderr": 0.0029563082706124282, "rouge2_fmeasure": 0.05447927752503051, "rouge2_fmeasure_stderr": 0.0012618043977617596, "rouge2_precision": 0.0699683530277847, "rouge2_precision_stderr": 0.0020960827184197486, "rouge2_recall": 0.06305007691775547, "rouge2_recall_stderr": 0.0015388476493559788, "rougeL_fmeasure": 0.1602275848002253, "rougeL_fmeasure_stderr": 0.001660994268757166, "rougeL_precision": 0.19452273753171953, "rougeL_precision_stderr": 0.00287285443881589, "rougeL_recall": 0.18824812828488288, "rougeL_recall_stderr": 0.002306801191665917, "rougeLsum_fmeasure": 0.19811964314605882, "rougeLsum_fmeasure_stderr": 0.002066579949503973, "rougeLsum_precision": 0.23777364194884903, "rougeLsum_precision_stderr": 0.0033469123908777755, "rougeLsum_recall": 0.2311323526084588, "rougeLsum_recall_stderr": 
0.00275352833642066}}, "2": {"tldr_en": {"bleu": 4.453155894580253, "bleu_stderr": 0.07102095729922746, "rouge1_fmeasure": 0.2423480992295036, "rouge1_fmeasure_stderr": 0.002213386982039197, "rouge1_precision": 0.3042280180045783, "rouge1_precision_stderr": 0.0036548780493867858, "rouge1_recall": 0.2657722896569855, "rouge1_recall_stderr": 0.002856010555746096, "rouge2_fmeasure": 0.0696826972387138, "rouge2_fmeasure_stderr": 0.00138974451583577, "rouge2_precision": 0.09313769731615627, "rouge2_precision_stderr": 0.002310899996162063, "rouge2_recall": 0.07567489802793778, "rouge2_recall_stderr": 0.00165355417900699, "rougeL_fmeasure": 0.18663522201860716, "rougeL_fmeasure_stderr": 0.001740118367633801, "rougeL_precision": 0.23678748519073842, "rougeL_precision_stderr": 0.0030313841471723817, "rougeL_recall": 0.2052468833712137, "rougeL_recall_stderr": 0.002285031155511772, "rougeLsum_fmeasure": 0.22792203815398218, "rougeLsum_fmeasure_stderr": 0.0021140458556974436, "rougeLsum_precision": 0.28748422717826916, "rougeLsum_precision_stderr": 0.0035473688052382567, "rougeLsum_recall": 0.24946630704943015, "rougeLsum_recall_stderr": 0.0026967966099014384}}, "3": {"tldr_en": {"bleu": 3.3998244876185093, "bleu_stderr": 0.07879957613227935, "rouge1_fmeasure": 0.20630036414089614, "rouge1_fmeasure_stderr": 0.0026291569934361863, "rouge1_precision": 0.2674845496647863, "rouge1_precision_stderr": 0.0039889863741841, "rouge1_recall": 0.22160894672160433, "rouge1_recall_stderr": 0.0032468878599473164, "rouge2_fmeasure": 0.05970860054565866, "rouge2_fmeasure_stderr": 0.0013699064127955533, "rouge2_precision": 0.08211313390771198, "rouge2_precision_stderr": 0.0022759780285685387, "rouge2_recall": 0.06436903346497445, "rouge2_recall_stderr": 0.0016476213678321223, "rougeL_fmeasure": 0.15931256883075381, "rougeL_fmeasure_stderr": 0.0020301040819156045, "rougeL_precision": 0.20900633859522688, "rougeL_precision_stderr": 0.0032474779669193566, "rougeL_recall": 0.17205068176936356, "rougeL_recall_stderr": 0.002592219077108911, "rougeLsum_fmeasure": 0.19413853066180423, "rougeLsum_fmeasure_stderr": 0.0024798764798439764, "rougeLsum_precision": 0.2532503820549326, "rougeLsum_precision_stderr": 0.00383540860381719, "rougeLsum_recall": 0.2082619543002508, "rougeLsum_recall_stderr": 0.00305787284903526}}, "4": {"tldr_en": {"bleu": 0.13483879286922137, "bleu_stderr": 0.011222185156728685, "rouge1_fmeasure": 0.06668350472619872, "rouge1_fmeasure_stderr": 0.0023223709305170443, "rouge1_precision": 0.092208189110962, "rouge1_precision_stderr": 0.0034189214358844324, "rouge1_recall": 0.07224586073962391, "rouge1_recall_stderr": 0.002716708158950956, "rouge2_fmeasure": 0.01919932938843845, "rouge2_fmeasure_stderr": 0.0010049481476114978, "rouge2_precision": 0.0276141917171982, "rouge2_precision_stderr": 0.0016764660911735634, "rouge2_recall": 0.021177864192540377, "rouge2_recall_stderr": 0.0012371206877780988, "rougeL_fmeasure": 0.05216028088524061, "rougeL_fmeasure_stderr": 0.0018383425985936733, "rougeL_precision": 0.07354514541641334, "rougeL_precision_stderr": 0.002828654152146427, "rougeL_recall": 0.056562703185008854, "rougeL_recall_stderr": 0.002168458322152195, "rougeLsum_fmeasure": 0.062781251763141, "rougeLsum_fmeasure_stderr": 0.002197731580536598, "rougeLsum_precision": 0.0875093402820744, "rougeLsum_precision_stderr": 0.003281948169701787, "rougeLsum_recall": 0.06780620805618982, "rougeLsum_recall_stderr": 0.0025609050791819783}}, "5": {"tldr_en": {"bleu": 1.1051284504073666e-13, "bleu_stderr": 
2.6395896486810652e-12, "rouge1_fmeasure": 0.010526309548207395, "rouge1_fmeasure_stderr": 0.0010240475142557462, "rouge1_precision": 0.015272421462795964, "rouge1_precision_stderr": 0.0015642398537380597, "rouge1_recall": 0.011340665089531033, "rouge1_recall_stderr": 0.0012049966779817216, "rouge2_fmeasure": 0.0032474649318922875, "rouge2_fmeasure_stderr": 0.00043456520285324997, "rouge2_precision": 0.004861708599486756, "rouge2_precision_stderr": 0.000775800963972687, "rouge2_recall": 0.003729726793524184, "rouge2_recall_stderr": 0.0005770564824763594, "rougeL_fmeasure": 0.008507384832807946, "rougeL_fmeasure_stderr": 0.00083698404273872, "rougeL_precision": 0.012655622009735225, "rougeL_precision_stderr": 0.001358908157095457, "rougeL_recall": 0.009170085952684662, "rougeL_recall_stderr": 0.0009929814693818242, "rougeLsum_fmeasure": 0.010004236605713649, "rougeLsum_fmeasure_stderr": 0.0009708230001026674, "rougeLsum_precision": 0.01465587941098198, "rougeLsum_precision_stderr": 0.001510887749703006, "rougeLsum_recall": 0.01078936789465723, "rougeLsum_recall_stderr": 0.0011506077585367758}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0, "rouge1_fmeasure_stderr": 0.0, "rouge1_precision": 0.0, "rouge1_precision_stderr": 0.0, "rouge1_recall": 0.0, "rouge1_recall_stderr": 0.0, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0, "rougeL_fmeasure_stderr": 0.0, "rougeL_precision": 0.0, "rougeL_precision_stderr": 0.0, "rougeL_recall": 0.0, "rougeL_recall_stderr": 0.0, "rougeLsum_fmeasure": 0.0, "rougeLsum_fmeasure_stderr": 0.0, "rougeLsum_precision": 0.0, "rougeLsum_precision_stderr": 0.0, "rougeLsum_recall": 0.0, "rougeLsum_recall_stderr": 0.0}}, "1": {"generate_text_restaurant": {"bleu": 11.807245592727982, "bleu_stderr": 0.14268734921118004, "rouge1_fmeasure": 0.46557148222586153, "rouge1_fmeasure_stderr": 0.0023657331986333577, "rouge1_precision": 0.5694397994427649, "rouge1_precision_stderr": 0.003239379072871242, "rouge1_recall": 0.43330345866585573, "rouge1_recall_stderr": 0.003044911625941733, "rouge2_fmeasure": 0.21912991483763275, "rouge2_fmeasure_stderr": 0.002069315036688718, "rouge2_precision": 0.2723840046490374, "rouge2_precision_stderr": 0.0027516718471111436, "rouge2_recall": 0.20361914910986256, "rouge2_recall_stderr": 0.0022039744834451503, "rougeL_fmeasure": 0.33955869538197114, "rougeL_fmeasure_stderr": 0.002087373657004864, "rougeL_precision": 0.4186095218334134, "rougeL_precision_stderr": 0.002968616014528494, "rougeL_recall": 0.3149916119112548, "rougeL_recall_stderr": 0.0024715985756080345, "rougeLsum_fmeasure": 0.38066023518841285, "rougeLsum_fmeasure_stderr": 0.0023459300318536126, "rougeLsum_precision": 0.46699348061463236, "rougeLsum_precision_stderr": 0.003191007134428128, "rougeLsum_recall": 0.35381509149376406, "rougeLsum_recall_stderr": 0.002783989467610118}}, "2": {"generate_text_restaurant": {"bleu": 14.207475603022154, "bleu_stderr": 0.20911303585808974, "rouge1_fmeasure": 0.49425084203142144, "rouge1_fmeasure_stderr": 0.0023454913129034973, "rouge1_precision": 0.5838609934278248, "rouge1_precision_stderr": 0.003219893788304826, "rouge1_recall": 0.4676453406199286, "rouge1_recall_stderr": 0.003012959553547953, "rouge2_fmeasure": 0.2450476239671336, "rouge2_fmeasure_stderr": 0.0021622871202404523, "rouge2_precision": 0.2920964834179201, "rouge2_precision_stderr": 
0.002749941677148143, "rouge2_recall": 0.23229590203180275, "rouge2_recall_stderr": 0.002340599122551353, "rougeL_fmeasure": 0.3646666089186555, "rougeL_fmeasure_stderr": 0.0021523804779859127, "rougeL_precision": 0.4326595154400953, "rougeL_precision_stderr": 0.0029553909822487533, "rougeL_recall": 0.34449565433901846, "rougeL_recall_stderr": 0.002541682059011724, "rougeLsum_fmeasure": 0.41151419710242576, "rougeLsum_fmeasure_stderr": 0.0023890835672487213, "rougeLsum_precision": 0.48625392335924433, "rougeLsum_precision_stderr": 0.0031665719502181147, "rougeLsum_recall": 0.3893023276736312, "rougeLsum_recall_stderr": 0.0028360977602737624}}, "3": {"generate_text_restaurant": {"bleu": 14.881355021152629, "bleu_stderr": 0.14798982038506134, "rouge1_fmeasure": 0.5001074662527342, "rouge1_fmeasure_stderr": 0.0022864719896120603, "rouge1_precision": 0.5813338263183422, "rouge1_precision_stderr": 0.0031068563548781497, "rouge1_recall": 0.47562332710532934, "rouge1_recall_stderr": 0.0029492958281890696, "rouge2_fmeasure": 0.2506081474778741, "rouge2_fmeasure_stderr": 0.0021353634161434465, "rouge2_precision": 0.2933839366604696, "rouge2_precision_stderr": 0.002656083464844314, "rouge2_recall": 0.23916505911615024, "rouge2_recall_stderr": 0.0023564329861628973, "rougeL_fmeasure": 0.36772667157315414, "rougeL_fmeasure_stderr": 0.002137446714056548, "rougeL_precision": 0.42886203590947314, "rougeL_precision_stderr": 0.0028648415383139367, "rougeL_recall": 0.34942359580045534, "rougeL_recall_stderr": 0.0025299540186551716, "rougeLsum_fmeasure": 0.41854738428796234, "rougeLsum_fmeasure_stderr": 0.0023767509258669573, "rougeLsum_precision": 0.48630515550742015, "rougeLsum_precision_stderr": 0.0030844914644101774, "rougeLsum_recall": 0.3982270500785823, "rougeLsum_recall_stderr": 0.0028293510736079205}}, "4": {"generate_text_restaurant": {"bleu": 15.161167960986678, "bleu_stderr": 0.1434464937691823, "rouge1_fmeasure": 0.5031994429048829, "rouge1_fmeasure_stderr": 0.002288693420421454, "rouge1_precision": 0.5828889415157229, "rouge1_precision_stderr": 0.003147155952785506, "rouge1_recall": 0.47816618968937713, "rouge1_recall_stderr": 0.0028905507376413963, "rouge2_fmeasure": 0.2523951605760761, "rouge2_fmeasure_stderr": 0.0021595061752098924, "rouge2_precision": 0.2947392128854523, "rouge2_precision_stderr": 0.002705329023608755, "rouge2_recall": 0.24017998180506026, "rouge2_recall_stderr": 0.0023323057167825125, "rougeL_fmeasure": 0.37084325448755734, "rougeL_fmeasure_stderr": 0.002169564896258564, "rougeL_precision": 0.43041916996749047, "rougeL_precision_stderr": 0.002887136906682548, "rougeL_recall": 0.352331798953218, "rougeL_recall_stderr": 0.0025196691672468205, "rougeLsum_fmeasure": 0.42211119537556474, "rougeLsum_fmeasure_stderr": 0.002395568997982313, "rougeLsum_precision": 0.48846331616257255, "rougeLsum_precision_stderr": 0.003116275576376514, "rougeLsum_recall": 0.4014433365412298, "rougeLsum_recall_stderr": 0.0028080914024126535}}, "5": {"generate_text_restaurant": {"bleu": 15.199579399737006, "bleu_stderr": 0.2197710642602159, "rouge1_fmeasure": 0.5056858022442728, "rouge1_fmeasure_stderr": 0.0022854558866388314, "rouge1_precision": 0.5888360095441599, "rouge1_precision_stderr": 0.003159796703393573, "rouge1_recall": 0.4770950472138781, "rouge1_recall_stderr": 0.0028673153056796336, "rouge2_fmeasure": 0.25480077699734965, "rouge2_fmeasure_stderr": 0.002191468155403316, "rouge2_precision": 0.2996637182544266, "rouge2_precision_stderr": 0.002755313978179728, "rouge2_recall": 
0.24042124929715428, "rouge2_recall_stderr": 0.0023404910908957914, "rougeL_fmeasure": 0.3714282516661889, "rougeL_fmeasure_stderr": 0.0021747599500282675, "rougeL_precision": 0.4337481481034359, "rougeL_precision_stderr": 0.0029181057449224846, "rougeL_recall": 0.35012226658269135, "rougeL_recall_stderr": 0.002503515068580577, "rougeLsum_fmeasure": 0.4246436350463985, "rougeLsum_fmeasure_stderr": 0.0023956383797766666, "rougeLsum_precision": 0.49452192644513626, "rougeLsum_precision_stderr": 0.0031483071698951075, "rougeLsum_recall": 0.4006792202843592, "rougeLsum_recall_stderr": 0.002780394897574505}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.051690852375094, "bleu_stderr": 0.0919729448596882, "rouge1_fmeasure": 0.19658526341354268, "rouge1_fmeasure_stderr": 0.0030181310698989858, "rouge1_precision": 0.1442510190731307, "rouge1_precision_stderr": 0.002377555886882021, "rouge1_recall": 0.32728957236911, "rouge1_recall_stderr": 0.005137969407531113, "rouge2_fmeasure": 0.04748504386381342, "rouge2_fmeasure_stderr": 0.0016110390499460413, "rouge2_precision": 0.0341581223240638, "rouge2_precision_stderr": 0.0011714333255302305, "rouge2_recall": 0.08176991191725373, "rouge2_recall_stderr": 0.0028112899026499264, "rougeL_fmeasure": 0.14457182138352206, "rougeL_fmeasure_stderr": 0.002237374752785832, "rougeL_precision": 0.10623987893735275, "rougeL_precision_stderr": 0.0018433462761128848, "rougeL_recall": 0.24157333634286496, "rougeL_recall_stderr": 0.003878678496036975, "rougeLsum_fmeasure": 0.15498205908882362, "rougeLsum_fmeasure_stderr": 0.0024811396605881834, "rougeLsum_precision": 0.11372296782139678, "rougeLsum_precision_stderr": 0.001999008147870968, "rougeLsum_recall": 0.25936158541657245, "rougeLsum_recall_stderr": 0.004311577104350546}}, "1": {"article_DOC_summary": {"bleu": 2.019074157200998, "bleu_stderr": 0.1854543722412079, "rouge1_fmeasure": 0.21363061590069807, "rouge1_fmeasure_stderr": 0.0033936589765782323, "rouge1_precision": 0.21346282259038915, "rouge1_precision_stderr": 0.004034521833789361, "rouge1_recall": 0.2499714791367549, "rouge1_recall_stderr": 0.004127977259047798, "rouge2_fmeasure": 0.0455477081115806, "rouge2_fmeasure_stderr": 0.0020625955375166345, "rouge2_precision": 0.0463678167671605, "rouge2_precision_stderr": 0.0022751074397479123, "rouge2_recall": 0.052694807564007726, "rouge2_recall_stderr": 0.002334059403839543, "rougeL_fmeasure": 0.1613719463559732, "rougeL_fmeasure_stderr": 0.0027032991504210456, "rougeL_precision": 0.1620749435386503, "rougeL_precision_stderr": 0.003255513503258099, "rougeL_recall": 0.18819404884414812, "rougeL_recall_stderr": 0.0031630393650993625, "rougeLsum_fmeasure": 0.16555112242432596, "rougeLsum_fmeasure_stderr": 0.0027468382198912893, "rougeLsum_precision": 0.16522714514641704, "rougeLsum_precision_stderr": 0.0032476758014330694, "rougeLsum_recall": 0.195040375695118, "rougeLsum_recall_stderr": 0.003405905178416842}}, "2": {"article_DOC_summary": {"bleu": 2.382425295806561, "bleu_stderr": 0.185437417569327, "rouge1_fmeasure": 0.23064419705209097, "rouge1_fmeasure_stderr": 0.003326199002008987, "rouge1_precision": 0.23937462775757284, "rouge1_precision_stderr": 0.004035761510891073, "rouge1_recall": 0.25228250851051215, "rouge1_recall_stderr": 0.0038577616272618006, "rouge2_fmeasure": 0.05025303080761008, "rouge2_fmeasure_stderr": 0.002124751664870315, "rouge2_precision": 0.05306913769010263, "rouge2_precision_stderr": 0.002351977805815094, "rouge2_recall": 0.05441248374509546, "rouge2_recall_stderr": 
0.002305037996521187, "rougeL_fmeasure": 0.1742531731484881, "rougeL_fmeasure_stderr": 0.0026993941796824017, "rougeL_precision": 0.1812036165079183, "rougeL_precision_stderr": 0.0032724786112898127, "rougeL_recall": 0.1907813012976707, "rougeL_recall_stderr": 0.0030970666700906747, "rougeLsum_fmeasure": 0.17671769275213156, "rougeLsum_fmeasure_stderr": 0.002734700050326904, "rougeLsum_precision": 0.1831569851914895, "rougeLsum_precision_stderr": 0.0032779593192606743, "rougeLsum_recall": 0.19469605892775696, "rougeLsum_recall_stderr": 0.003266640672239043}}, "3": {"article_DOC_summary": {"bleu": 2.8525428900629293, "bleu_stderr": 0.1568990466406635, "rouge1_fmeasure": 0.22778807117999614, "rouge1_fmeasure_stderr": 0.0036243774679184364, "rouge1_precision": 0.24352976537606963, "rouge1_precision_stderr": 0.004350047350541235, "rouge1_recall": 0.23908129808521703, "rouge1_recall_stderr": 0.004002524315183232, "rouge2_fmeasure": 0.0527365635483454, "rouge2_fmeasure_stderr": 0.002240194553760494, "rouge2_precision": 0.05729292572552575, "rouge2_precision_stderr": 0.002617653205209828, "rouge2_recall": 0.05451017420559761, "rouge2_recall_stderr": 0.002306325013293872, "rougeL_fmeasure": 0.17283311820835626, "rougeL_fmeasure_stderr": 0.0029786990259966836, "rougeL_precision": 0.18538163733792024, "rougeL_precision_stderr": 0.003600645888264381, "rougeL_recall": 0.18116114634018265, "rougeL_recall_stderr": 0.0032128881913064335, "rougeLsum_fmeasure": 0.1743815051388937, "rougeLsum_fmeasure_stderr": 0.003003683858669049, "rougeLsum_precision": 0.18664238451530393, "rougeLsum_precision_stderr": 0.0036151543248168095, "rougeLsum_recall": 0.18367415808096732, "rougeLsum_recall_stderr": 0.003324929336987573}}, "4": {"article_DOC_summary": {"bleu": 0.15163352455062296, "bleu_stderr": 0.0413547235971038, "rouge1_fmeasure": 0.059542504834570575, "rouge1_fmeasure_stderr": 0.0035683053141444022, "rouge1_precision": 0.06761052931830998, "rouge1_precision_stderr": 0.004168524274964794, "rouge1_recall": 0.05943339586169317, "rouge1_recall_stderr": 0.0036304617540668326, "rouge2_fmeasure": 0.014234207773434364, "rouge2_fmeasure_stderr": 0.0014260690361126657, "rouge2_precision": 0.01636682763621811, "rouge2_precision_stderr": 0.0016719863572741598, "rouge2_recall": 0.014189933840370804, "rouge2_recall_stderr": 0.0014657982637994635, "rougeL_fmeasure": 0.04594672257630698, "rougeL_fmeasure_stderr": 0.002810904537853349, "rougeL_precision": 0.052604784133526844, "rougeL_precision_stderr": 0.003336410205451067, "rougeL_recall": 0.045692475599885846, "rougeL_recall_stderr": 0.0028314510568171344, "rougeLsum_fmeasure": 0.04622428003631112, "rougeLsum_fmeasure_stderr": 0.0028346596378458684, "rougeLsum_precision": 0.05281145831744821, "rougeLsum_precision_stderr": 0.0033495714563258135, "rougeLsum_recall": 0.046242631863418045, "rougeLsum_recall_stderr": 0.002916590491225937}}, "5": {"article_DOC_summary": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0, "rouge1_fmeasure_stderr": 0.0, "rouge1_precision": 0.0, "rouge1_precision_stderr": 0.0, "rouge1_recall": 0.0, "rouge1_recall_stderr": 0.0, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0, "rougeL_fmeasure_stderr": 0.0, "rougeL_precision": 0.0, "rougeL_precision_stderr": 0.0, "rougeL_recall": 0.0, "rougeL_recall_stderr": 0.0, "rougeLsum_fmeasure": 0.0, "rougeLsum_fmeasure_stderr": 0.0, "rougeLsum_precision": 0.0, 
"rougeLsum_precision_stderr": 0.0, "rougeLsum_recall": 0.0, "rougeLsum_recall_stderr": 0.0}}}} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d43ceb48c22837c2e284dc88e8dec4e95f34ce01 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.3236606671689831, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03434728960928616 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07548519902353701, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027103393867684133 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.27434436630785664, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005190735227370315 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.09843971445320344, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020285748309311217 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03280247893554901, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015645885841163746 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1283696309182241, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0032411037301123703 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.044864078248862495, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012327331599678007 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07235248106657086, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002585891559293976 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.26594645785931426, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.005067281777094968 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.09452977794835304, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019058829105986163 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07204958395594553, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00260168307386312 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2598611687840442, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004868835623139477 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.09372697520153284, 
+ "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019257190223106488 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..57f7873969d5bbcecc7bf347f8c162f6a08382b7 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5303952344866372, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03866628930999967 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.13500481273386208, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004456041975528417 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3050209335340558, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005019779051030469 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.15526776951779941, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0037112903723659145 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.06800836209907518, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0029183003453498208 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15616590508602413, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0035788033957169676 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.07803113267822599, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0024911371039099254 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.12100254673151754, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0038758351195453834 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.28545106347976323, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004617233759164798 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1409133691299396, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003170026813140542 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + 
"rougeLsum_precision": 0.12390505105203532, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003973176754882277 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.28858761694209856, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004669345842896859 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.14369743370526267, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003262924473269048 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fd9f16be65d3d79712118f38fd20ea9640cce8f3 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6963169812480917, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.025883064453583102 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.1792256974533611, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005365719346439527 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.34488184545131717, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004970214155406786 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.19513258665465127, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0043495841141288575 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.0957315113405197, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0035283155722927633 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1834475236414602, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037698051706010482 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.10282617536971313, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002972081492632468 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.15875527177897067, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004689203205430283 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31884399599505703, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004581869680066843 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1746470290433306, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003733649528603583 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.16348748083622414, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004839999383468051 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3238241924312652, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0046243997645253 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.17894683258352004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003839954827867676 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bb582b2957bef96e25a0412544a6a0b9933673c9 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.8274667802488902, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.042366964185806814 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.18955004198888442, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005662094291901046 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3533218112336572, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004960841139206212 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.20403726404125458, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004566219372488845 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.10477428578899617, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003935236903263568 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1901585058979038, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 
0.0038815093874070626 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.11040571352169745, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0032753908389439508 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.16706159999567868, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004913635515279735 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.32629206613585876, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004527254280455867 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.18223468887125507, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003924418368935964 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.17216966052247287, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005091094356122184 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3313273065171706, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045736991609824015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.18670844563664446, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0040383704320243125 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..45545788005ff8a2bff860a33886eb87d334a5ae --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.9591911553106375, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05993639067207907 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.1924412284693008, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005450604205447632 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3705586639190707, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005109363173845363 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.2120064135279467, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rouge1_fmeasure_stderr": 0.004550653208173271 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.10648010653830348, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00377122887055479 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.2022148169333304, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003998276489277708 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.11494130887956767, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0032162324276510928 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.16950864458355333, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004740831041491813 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.34083637585044774, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004631727723114902 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.18886918285090654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038962288174863224 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.17511108437715225, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004905333361798361 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.34672946562667306, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00468590709544727 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1940229701192449, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00401611332459292 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2c9857e6073bb2f0fd5ceb447e82566aa41e8748 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 1.0660500946745477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.057054038562455854 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.2143907231679788, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00587683477313044 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.37935769424464766, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005114542762979336 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.22541411583738258, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0046964527830320015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.11999776807861791, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0040878065570314984 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.20846703611506925, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004052332137679142 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.12304296607701624, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0033297218439898735 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.1874233346433946, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0050850214855498194 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3480461100730932, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004676777395335985 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.19952305726552913, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004002247434870572 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.19460128268928695, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005298127155329163 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3552748425353144, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004735986385918885 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.2058454997784591, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004150385211942694 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..ec2600c0139e88942cad542a26a393a723773f2e --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.1325770613482503, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0025192320948419973 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.20000361049618887, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033883319122602765 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.14568887330505176, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024304012352800524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.030157012060673342, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009243132657339652 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.04736363206911103, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014655640703641456 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03380900442536606, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009820401088417524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.10196660477211375, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018957212855749772 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.15791691922428616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026796281489341964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.11261046327516765, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017887243866971374 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1229739908341885, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00235776548625545 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.18595911596519776, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003178608298694486 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.13507023756955555, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002258433462891094 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.36187522199539, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.13845059941627463 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bb2a3730608b09ba980e63e0de5aaef49fd0c0e8 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.253166878261391, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00348988815268275 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.24765343018422936, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029563082706124282 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2120115141592031, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002201889759622243 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0699683530277847, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0020960827184197486 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06305007691775547, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015388476493559788 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05447927752503051, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012618043977617596 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.19452273753171953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00287285443881589 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.18824812828488288, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002306801191665917 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1602275848002253, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001660994268757166 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.23777364194884903, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0033469123908777755 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2311323526084588, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_recall_stderr": 0.00275352833642066 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.19811964314605882, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002066579949503973 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.3670705128175316, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11604750071877634 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..079e771fb8eb01afb8258ac9e7edcb8ceb2fc0c9 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.3042280180045783, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0036548780493867858 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2657722896569855, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002856010555746096 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2423480992295036, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002213386982039197 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.09313769731615627, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002310899996162063 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07567489802793778, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00165355417900699 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0696826972387138, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00138974451583577 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.23678748519073842, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0030313841471723817 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.2052468833712137, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002285031155511772 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.18663522201860716, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001740118367633801 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.28748422717826916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0035473688052382567 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.24946630704943015, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026967966099014384 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.22792203815398218, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021140458556974436 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 4.453155894580253, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07102095729922746 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..61eadbefb42e1f33f01e0d7d83089abd05e5aa6c --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2674845496647863, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0039889863741841 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.22160894672160433, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032468878599473164 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.20630036414089614, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0026291569934361863 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.08211313390771198, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0022759780285685387 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06436903346497445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016476213678321223 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05970860054565866, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013699064127955533 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.20900633859522688, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0032474779669193566 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.17205068176936356, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002592219077108911 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15931256883075381, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0020301040819156045 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2532503820549326, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00383540860381719 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2082619543002508, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00305787284903526 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.19413853066180423, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024798764798439764 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.3998244876185093, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07879957613227935 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eeb97146a6c62b02abfc1334afe529a382d33aa5 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.092208189110962, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0034189214358844324 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.07224586073962391, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002716708158950956 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06668350472619872, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023223709305170443 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0276141917171982, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rouge2_precision_stderr": 0.0016764660911735634 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.021177864192540377, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012371206877780988 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.01919932938843845, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010049481476114978 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07354514541641334, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002828654152146427 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.056562703185008854, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002168458322152195 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.05216028088524061, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018383425985936733 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.0875093402820744, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003281948169701787 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06780620805618982, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025609050791819783 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.062781251763141, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002197731580536598 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.13483879286922137, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.011222185156728685 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cc9a03320d08fa3e99598624e13db9da3c818c87 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.015272421462795964, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015642398537380597 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"rouge1_recall": 0.011340665089531033, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0012049966779817216 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.010526309548207395, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010240475142557462 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.004861708599486756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000775800963972687 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.003729726793524184, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005770564824763594 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0032474649318922875, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00043456520285324997 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.012655622009735225, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001358908157095457 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.009170085952684662, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009929814693818242 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.008507384832807946, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00083698404273872 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.01465587941098198, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001510887749703006 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.01078936789465723, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011506077585367758 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.010004236605713649, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009708230001026674 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.1051284504073666e-13, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.6395896486810652e-12 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json 
b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c7cd10935215d82f9abdd832e7b938a0583d41ea --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.0, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3b877d8145af536356df2e49e9246143a3513149 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.807245592727982, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14268734921118004 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5694397994427649, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003239379072871242 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.43330345866585573, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003044911625941733 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.46557148222586153, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023657331986333577 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2723840046490374, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027516718471111436 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.20361914910986256, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022039744834451503 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.21912991483763275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002069315036688718 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4186095218334134, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002968616014528494 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3149916119112548, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024715985756080345 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.33955869538197114, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002087373657004864 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.46699348061463236, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003191007134428128 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.35381509149376406, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002783989467610118 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.38066023518841285, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023459300318536126 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d6b144bbb066febb0c46d18b93f8336cec539a71 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.207475603022154, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20911303585808974 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5838609934278248, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003219893788304826 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4676453406199286, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003012959553547953 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.49425084203142144, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023454913129034973 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2920964834179201, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002749941677148143 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23229590203180275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002340599122551353 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2450476239671336, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021622871202404523 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4326595154400953, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029553909822487533 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34449565433901846, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002541682059011724 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3646666089186555, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021523804779859127 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.48625392335924433, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031665719502181147 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3893023276736312, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0028360977602737624 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.41151419710242576, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023890835672487213 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5bcd8c63791228dbba0c7ad4181bef6a1f633719 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.881355021152629, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14798982038506134 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5813338263183422, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031068563548781497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47562332710532934, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029492958281890696 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5001074662527342, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 
0.0022864719896120603 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2933839366604696, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002656083464844314 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23916505911615024, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023564329861628973 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2506081474778741, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021353634161434465 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.42886203590947314, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028648415383139367 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34942359580045534, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025299540186551716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.36772667157315414, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002137446714056548 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.48630515550742015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0030844914644101774 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3982270500785823, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0028293510736079205 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.41854738428796234, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023767509258669573 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ce4629eb162c382559f8e035d870e6a37ce5e656 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.161167960986678, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "bleu_stderr": 0.1434464937691823 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5828889415157229, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003147155952785506 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47816618968937713, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028905507376413963 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5031994429048829, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002288693420421454 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2947392128854523, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002705329023608755 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24017998180506026, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023323057167825125 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2523951605760761, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021595061752098924 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43041916996749047, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002887136906682548 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.352331798953218, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025196691672468205 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.37084325448755734, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002169564896258564 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.48846331616257255, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003116275576376514 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4014433365412298, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0028080914024126535 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.42211119537556474, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002395568997982313 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c263bee4edac771ceb0f544ab48a204424cacc95 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.199579399737006, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2197710642602159 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5888360095441599, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003159796703393573 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4770950472138781, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028673153056796336 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5056858022442728, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022854558866388314 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2996637182544266, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002755313978179728 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24042124929715428, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023404910908957914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.25480077699734965, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002191468155403316 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4337481481034359, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029181057449224846 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35012226658269135, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002503515068580577 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3714282516661889, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021747599500282675 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.49452192644513626, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031483071698951075 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 
0.4006792202843592, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002780394897574505 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4246436350463985, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023956383797766666 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_0.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5043111707438f6e99fba55c03da6eeff78cdd26 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1442510190731307, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002377555886882021 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.32728957236911, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.005137969407531113 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19658526341354268, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0030181310698989858 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0341581223240638, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011714333255302305 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08176991191725373, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028112899026499264 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04748504386381342, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016110390499460413 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10623987893735275, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018433462761128848 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.24157333634286496, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003878678496036975 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14457182138352206, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002237374752785832 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11372296782139678, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001999008147870968 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.25936158541657245, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004311577104350546 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15498205908882362, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024811396605881834 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.051690852375094, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0919729448596882 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_1.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..724b8d3cfe86310d48d5a902370254cf07f1fd48 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.21346282259038915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004034521833789361 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2499714791367549, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004127977259047798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21363061590069807, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0033936589765782323 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0463678167671605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0022751074397479123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.052694807564007726, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002334059403839543 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0455477081115806, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0020625955375166345 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1620749435386503, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003255513503258099 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.18819404884414812, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031630393650993625 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1613719463559732, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0027032991504210456 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.16522714514641704, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0032476758014330694 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.195040375695118, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003405905178416842 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16555112242432596, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0027468382198912893 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.019074157200998, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1854543722412079 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_2.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ef8d3a00737c54523ac3cac3788bf65a76fdc17c --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.23937462775757284, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004035761510891073 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.25228250851051215, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0038577616272618006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.23064419705209097, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003326199002008987 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.05306913769010263, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002351977805815094 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.05441248374509546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002305037996521187 
+ }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05025303080761008, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002124751664870315 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1812036165079183, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0032724786112898127 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.1907813012976707, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030970666700906747 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1742531731484881, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0026993941796824017 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1831569851914895, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0032779593192606743 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.19469605892775696, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003266640672239043 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17671769275213156, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002734700050326904 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.382425295806561, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.185437417569327 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_3.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..158f496759455052fe55d3a175f34734afc3c4db --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.24352976537606963, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004350047350541235 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.23908129808521703, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004002524315183232 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.22778807117999614, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 
0.0036243774679184364 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.05729292572552575, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002617653205209828 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.05451017420559761, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002306325013293872 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0527365635483454, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002240194553760494 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.18538163733792024, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003600645888264381 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.18116114634018265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032128881913064335 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.17283311820835626, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0029786990259966836 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.18664238451530393, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0036151543248168095 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.18367415808096732, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003324929336987573 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1743815051388937, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.003003683858669049 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.8525428900629293, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1568990466406635 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_4.json b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b8dbfe332891d765b81767e1662bf08e465f8970 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.06761052931830998, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge1_precision_stderr": 0.004168524274964794 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.05943339586169317, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0036304617540668326 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.059542504834570575, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0035683053141444022 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.01636682763621811, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016719863572741598 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.014189933840370804, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014657982637994635 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.014234207773434364, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014260690361126657 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.052604784133526844, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003336410205451067 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.045692475599885846, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0028314510568171344 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04594672257630698, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002810904537853349 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.05281145831744821, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0033495714563258135 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.046242631863418045, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002916590491225937 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04622428003631112, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0028346596378458684 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.15163352455062296, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0413547235971038 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_5.json 
b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0efd886b57c1bd8fb51598ae66447f081e4ac7d1 --- /dev/null +++ b/4b284b28boscar/evaluation/generation/slim.4b284b28boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b28boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b28boscar/evaluation/rankeval/4b284b28boscar_0.json b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cf264f5abf3f7e8b9107bc74b5b6e619445e1052 --- /dev/null +++ b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.34, + "acc_stderr": 0.014987482264363937 + }, + "anli_r2": { + "acc": 0.34, + "acc_stderr": 0.014987482264363937 + }, + "anli_r3": { + "acc": 0.3408333333333333, + "acc_stderr": 0.013688600793296939 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.1818181818181818 + }, + "copa": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506 + }, + "hellaswag": { + "acc": 0.4060944035052778, + "acc_stderr": 0.004900988997414227, + "acc_norm": 0.5160326628161721, + "acc_norm_stderr": 0.004987215542259667 + }, + "rte": { + "acc": 0.5487364620938628, + "acc_stderr": 0.029953149241808946 + }, + "winogrande": { + "acc": 0.5430149960536701, + "acc_stderr": 0.01400038676159829 + }, + "storycloze_2016": { + "acc": 0.6739711384286478, + "acc_stderr": 0.010839964752045184 + }, + "boolq": { + "acc": 0.5318042813455658, + "acc_stderr": 0.008727345583419184 + }, + "arc_easy": { + "acc": 0.5694444444444444, + "acc_stderr": 0.010160345396860075, + "acc_norm": 0.5151515151515151, + "acc_norm_stderr": 0.010255071794531504 + }, + "arc_challenge": { + "acc": 0.24744027303754265, + "acc_stderr": 0.01261035266329267, + "acc_norm": 0.2858361774744027, + "acc_norm_stderr": 0.013203196088537369 + }, + "sciq": { + "acc": 0.833, + "acc_stderr": 0.011800434324644594, + "acc_norm": 0.754, + "acc_norm_stderr": 0.013626065817750636 + }, + "piqa": { + "acc": 0.7219804134929271, + "acc_stderr": 0.010453117358332802, + "acc_norm": 0.7247007616974973, + "acc_norm_stderr": 0.01042142927736953 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/rankeval/4b284b28boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..cf264f5abf3f7e8b9107bc74b5b6e619445e1052 --- /dev/null +++ b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_0_lm-eval_global_step80108_2023-01-30-19-47-03_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.34, + "acc_stderr": 0.014987482264363937 + }, + "anli_r2": { + "acc": 0.34, + "acc_stderr": 0.014987482264363937 + }, + "anli_r3": { + "acc": 0.3408333333333333, + "acc_stderr": 0.013688600793296939 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.1818181818181818 + }, + "copa": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506 + }, + "hellaswag": { + "acc": 0.4060944035052778, + "acc_stderr": 0.004900988997414227, + "acc_norm": 0.5160326628161721, + "acc_norm_stderr": 0.004987215542259667 + }, + "rte": { + "acc": 0.5487364620938628, + "acc_stderr": 0.029953149241808946 + }, + "winogrande": { + "acc": 0.5430149960536701, + "acc_stderr": 0.01400038676159829 + }, + "storycloze_2016": { + "acc": 0.6739711384286478, + "acc_stderr": 0.010839964752045184 + }, + "boolq": { + "acc": 0.5318042813455658, + 
"acc_stderr": 0.008727345583419184 + }, + "arc_easy": { + "acc": 0.5694444444444444, + "acc_stderr": 0.010160345396860075, + "acc_norm": 0.5151515151515151, + "acc_norm_stderr": 0.010255071794531504 + }, + "arc_challenge": { + "acc": 0.24744027303754265, + "acc_stderr": 0.01261035266329267, + "acc_norm": 0.2858361774744027, + "acc_norm_stderr": 0.013203196088537369 + }, + "sciq": { + "acc": 0.833, + "acc_stderr": 0.011800434324644594, + "acc_norm": 0.754, + "acc_norm_stderr": 0.013626065817750636 + }, + "piqa": { + "acc": 0.7219804134929271, + "acc_stderr": 0.010453117358332802, + "acc_norm": 0.7247007616974973, + "acc_norm_stderr": 0.01042142927736953 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/rankeval/4b284b28boscar_1.json b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9cc176aaf1930ddb4dc2fd397be1ac13277669da --- /dev/null +++ b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.336, + "acc_stderr": 0.014944140233795023 + }, + "anli_r2": { + "acc": 0.326, + "acc_stderr": 0.014830507204541037 + }, + "anli_r3": { + "acc": 0.33, + "acc_stderr": 0.013579531277800922 + }, + "cb": { + "acc": 0.25, + "acc_stderr": 0.058387420812114225, + "f1": 0.2095321637426901 + }, + "copa": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276 + }, + "hellaswag": { + "acc": 0.4051981676956781, + "acc_stderr": 0.004899270310557984, + "acc_norm": 0.5231029675363473, + "acc_norm_stderr": 0.004984452002563928 + }, + "rte": { + "acc": 0.49097472924187724, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.5469613259668509, + "acc_stderr": 0.013990366632148104 + }, + "storycloze_2016": { + "acc": 0.6632816675574559, + "acc_stderr": 0.010928525619392455 + }, + "boolq": { + "acc": 0.57217125382263, + "acc_stderr": 0.008653474894637182 + }, + "arc_easy": { + "acc": 0.5854377104377104, + "acc_stderr": 0.010108889212447769, + "acc_norm": 0.5723905723905723, + "acc_norm_stderr": 0.010151683397430677 + }, + "arc_challenge": { + "acc": 0.2721843003412969, + "acc_stderr": 0.013006600406423707, + "acc_norm": 0.29436860068259385, + "acc_norm_stderr": 0.013318528460539422 + }, + "sciq": { + "acc": 0.891, + "acc_stderr": 0.009859828407037188, + "acc_norm": 0.883, + "acc_norm_stderr": 0.010169287802713327 + }, + "piqa": { + "acc": 0.721436343852013, + "acc_stderr": 0.010459397235965182, + "acc_norm": 0.719260065288357, + "acc_norm_stderr": 0.010484325438311827 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/rankeval/4b284b28boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..9cc176aaf1930ddb4dc2fd397be1ac13277669da --- /dev/null +++ 
b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_1_lm-eval_global_step80108_2023-01-30-19-47-03_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.336, + "acc_stderr": 0.014944140233795023 + }, + "anli_r2": { + "acc": 0.326, + "acc_stderr": 0.014830507204541037 + }, + "anli_r3": { + "acc": 0.33, + "acc_stderr": 0.013579531277800922 + }, + "cb": { + "acc": 0.25, + "acc_stderr": 0.058387420812114225, + "f1": 0.2095321637426901 + }, + "copa": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276 + }, + "hellaswag": { + "acc": 0.4051981676956781, + "acc_stderr": 0.004899270310557984, + "acc_norm": 0.5231029675363473, + "acc_norm_stderr": 0.004984452002563928 + }, + "rte": { + "acc": 0.49097472924187724, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.5469613259668509, + "acc_stderr": 0.013990366632148104 + }, + "storycloze_2016": { + "acc": 0.6632816675574559, + "acc_stderr": 0.010928525619392455 + }, + "boolq": { + "acc": 0.57217125382263, + "acc_stderr": 0.008653474894637182 + }, + "arc_easy": { + "acc": 0.5854377104377104, + "acc_stderr": 0.010108889212447769, + "acc_norm": 0.5723905723905723, + "acc_norm_stderr": 0.010151683397430677 + }, + "arc_challenge": { + "acc": 0.2721843003412969, + "acc_stderr": 0.013006600406423707, + "acc_norm": 0.29436860068259385, + "acc_norm_stderr": 0.013318528460539422 + }, + "sciq": { + "acc": 0.891, + "acc_stderr": 0.009859828407037188, + "acc_norm": 0.883, + "acc_norm_stderr": 0.010169287802713327 + }, + "piqa": { + "acc": 0.721436343852013, + "acc_stderr": 0.010459397235965182, + "acc_norm": 0.719260065288357, + "acc_norm_stderr": 0.010484325438311827 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/rankeval/4b284b28boscar_2.json b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bd6699519eec959472d5b18d393649a329ef990f --- /dev/null +++ b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.341, + "acc_stderr": 0.014998131348402707 + }, + "anli_r2": { + "acc": 0.343, + "acc_stderr": 0.015019206922356953 + }, + "anli_r3": { + "acc": 0.32666666666666666, + "acc_stderr": 0.013544340907003665 + }, + "cb": { + "acc": 0.25, + "acc_stderr": 0.058387420812114225, + "f1": 0.2075098814229249 + }, + "copa": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394 + }, + "hellaswag": { + "acc": 0.40728938458474406, + "acc_stderr": 0.0049032542641776235, + "acc_norm": 0.5259908384783908, + "acc_norm_stderr": 0.004983035420235712 + }, + "rte": { + "acc": 0.5054151624548736, + "acc_stderr": 0.030094698123239966 + }, + "winogrande": { + "acc": 0.5516969218626677, + "acc_stderr": 0.013977171307126345 + }, + "storycloze_2016": { + "acc": 0.6675574559059326, + "acc_stderr": 0.010893860778343539 + }, + "boolq": { + "acc": 0.5795107033639144, + "acc_stderr": 0.008633775332463619 + }, + "arc_easy": { + "acc": 0.6031144781144782, + "acc_stderr": 0.010039236800583206, + "acc_norm": 0.5858585858585859, + "acc_norm_stderr": 0.01010738767300251 + }, + "arc_challenge": { + "acc": 0.25853242320819114, + "acc_stderr": 0.012794553754288692, + "acc_norm": 0.29948805460750855, + "acc_norm_stderr": 0.013385021637313572 + }, + 
"sciq": { + "acc": 0.902, + "acc_stderr": 0.00940661918462124, + "acc_norm": 0.9, + "acc_norm_stderr": 0.009491579957525044 + }, + "piqa": { + "acc": 0.7219804134929271, + "acc_stderr": 0.010453117358332795, + "acc_norm": 0.7236126224156693, + "acc_norm_stderr": 0.010434162388275608 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/rankeval/4b284b28boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..bd6699519eec959472d5b18d393649a329ef990f --- /dev/null +++ b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_2_lm-eval_global_step80108_2023-01-30-19-47-03_2shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.341, + "acc_stderr": 0.014998131348402707 + }, + "anli_r2": { + "acc": 0.343, + "acc_stderr": 0.015019206922356953 + }, + "anli_r3": { + "acc": 0.32666666666666666, + "acc_stderr": 0.013544340907003665 + }, + "cb": { + "acc": 0.25, + "acc_stderr": 0.058387420812114225, + "f1": 0.2075098814229249 + }, + "copa": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394 + }, + "hellaswag": { + "acc": 0.40728938458474406, + "acc_stderr": 0.0049032542641776235, + "acc_norm": 0.5259908384783908, + "acc_norm_stderr": 0.004983035420235712 + }, + "rte": { + "acc": 0.5054151624548736, + "acc_stderr": 0.030094698123239966 + }, + "winogrande": { + "acc": 0.5516969218626677, + "acc_stderr": 0.013977171307126345 + }, + "storycloze_2016": { + "acc": 0.6675574559059326, + "acc_stderr": 0.010893860778343539 + }, + "boolq": { + "acc": 0.5795107033639144, + "acc_stderr": 0.008633775332463619 + }, + "arc_easy": { + "acc": 0.6031144781144782, + "acc_stderr": 0.010039236800583206, + "acc_norm": 0.5858585858585859, + "acc_norm_stderr": 0.01010738767300251 + }, + "arc_challenge": { + "acc": 0.25853242320819114, + "acc_stderr": 0.012794553754288692, + "acc_norm": 0.29948805460750855, + "acc_norm_stderr": 0.013385021637313572 + }, + "sciq": { + "acc": 0.902, + "acc_stderr": 0.00940661918462124, + "acc_norm": 0.9, + "acc_norm_stderr": 0.009491579957525044 + }, + "piqa": { + "acc": 0.7219804134929271, + "acc_stderr": 0.010453117358332795, + "acc_norm": 0.7236126224156693, + "acc_norm_stderr": 0.010434162388275608 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/rankeval/4b284b28boscar_3.json b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_3.json new file mode 100644 index 0000000000000000000000000000000000000000..134abbcbd63be4318c5a9787d36c7cc397b93ddf --- /dev/null +++ b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.01489959724281149 + }, + "anli_r2": { + "acc": 0.362, + "acc_stderr": 0.015204840912919503 + }, + "anli_r3": { + "acc": 0.33416666666666667, + "acc_stderr": 0.013622434813136788 + }, + "cb": { + "acc": 0.4642857142857143, + "acc_stderr": 
0.06724777654937658, + "f1": 0.4217687074829932 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.40420235012945627, + "acc_stderr": 0.004897340793314381, + "acc_norm": 0.5269866560446126, + "acc_norm_stderr": 0.004982508198584267 + }, + "rte": { + "acc": 0.5776173285198556, + "acc_stderr": 0.02973162264649588 + }, + "winogrande": { + "acc": 0.5351223362273086, + "acc_stderr": 0.014017773120881585 + }, + "storycloze_2016": { + "acc": 0.6675574559059326, + "acc_stderr": 0.01089386077834354 + }, + "boolq": { + "acc": 0.5688073394495413, + "acc_stderr": 0.008661853128165595 + }, + "arc_easy": { + "acc": 0.6014309764309764, + "acc_stderr": 0.010046455400477943, + "acc_norm": 0.585016835016835, + "acc_norm_stderr": 0.01011038315196114 + }, + "arc_challenge": { + "acc": 0.28071672354948807, + "acc_stderr": 0.013131238126975578, + "acc_norm": 0.3046075085324232, + "acc_norm_stderr": 0.013449522109932489 + }, + "sciq": { + "acc": 0.918, + "acc_stderr": 0.008680515615523727, + "acc_norm": 0.908, + "acc_norm_stderr": 0.009144376393151098 + }, + "piqa": { + "acc": 0.7274211099020674, + "acc_stderr": 0.010389256803296023, + "acc_norm": 0.7290533188248096, + "acc_norm_stderr": 0.010369718937426844 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/rankeval/4b284b28boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..134abbcbd63be4318c5a9787d36c7cc397b93ddf --- /dev/null +++ b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_3_lm-eval_global_step80108_2023-01-30-19-47-03_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.01489959724281149 + }, + "anli_r2": { + "acc": 0.362, + "acc_stderr": 0.015204840912919503 + }, + "anli_r3": { + "acc": 0.33416666666666667, + "acc_stderr": 0.013622434813136788 + }, + "cb": { + "acc": 0.4642857142857143, + "acc_stderr": 0.06724777654937658, + "f1": 0.4217687074829932 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.40420235012945627, + "acc_stderr": 0.004897340793314381, + "acc_norm": 0.5269866560446126, + "acc_norm_stderr": 0.004982508198584267 + }, + "rte": { + "acc": 0.5776173285198556, + "acc_stderr": 0.02973162264649588 + }, + "winogrande": { + "acc": 0.5351223362273086, + "acc_stderr": 0.014017773120881585 + }, + "storycloze_2016": { + "acc": 0.6675574559059326, + "acc_stderr": 0.01089386077834354 + }, + "boolq": { + "acc": 0.5688073394495413, + "acc_stderr": 0.008661853128165595 + }, + "arc_easy": { + "acc": 0.6014309764309764, + "acc_stderr": 0.010046455400477943, + "acc_norm": 0.585016835016835, + "acc_norm_stderr": 0.01011038315196114 + }, + "arc_challenge": { + "acc": 0.28071672354948807, + "acc_stderr": 0.013131238126975578, + "acc_norm": 0.3046075085324232, + "acc_norm_stderr": 0.013449522109932489 + }, + "sciq": { + "acc": 0.918, + "acc_stderr": 0.008680515615523727, + "acc_norm": 0.908, + "acc_norm_stderr": 0.009144376393151098 + }, + "piqa": { + "acc": 0.7274211099020674, + "acc_stderr": 0.010389256803296023, + "acc_norm": 
0.7290533188248096, + "acc_norm_stderr": 0.010369718937426844 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/rankeval/4b284b28boscar_4.json b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f8e7d2806010d11e2bc3fc26aaee8da3b8ac76ac --- /dev/null +++ b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.336, + "acc_stderr": 0.014944140233795027 + }, + "anli_r2": { + "acc": 0.34, + "acc_stderr": 0.014987482264363937 + }, + "anli_r3": { + "acc": 0.3516666666666667, + "acc_stderr": 0.013789711695404801 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.33413848631239934 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.4049990041824338, + "acc_stderr": 0.004898886080687925, + "acc_norm": 0.5279824736108345, + "acc_norm_stderr": 0.004981961097590808 + }, + "rte": { + "acc": 0.49097472924187724, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.5422257300710339, + "acc_stderr": 0.014002284504422438 + }, + "storycloze_2016": { + "acc": 0.6734366648850882, + "acc_stderr": 0.010844543793668893 + }, + "boolq": { + "acc": 0.5605504587155963, + "acc_stderr": 0.008680693125810188 + }, + "arc_easy": { + "acc": 0.6064814814814815, + "acc_stderr": 0.010024426884292557, + "acc_norm": 0.5917508417508418, + "acc_norm_stderr": 0.010085566195791252 + }, + "arc_challenge": { + "acc": 0.26109215017064846, + "acc_stderr": 0.012835523909473847, + "acc_norm": 0.29948805460750855, + "acc_norm_stderr": 0.013385021637313572 + }, + "sciq": { + "acc": 0.915, + "acc_stderr": 0.00882342636694232, + "acc_norm": 0.911, + "acc_norm_stderr": 0.009008893392651525 + }, + "piqa": { + "acc": 0.7165397170837867, + "acc_stderr": 0.010515057791152076, + "acc_norm": 0.7236126224156693, + "acc_norm_stderr": 0.01043416238827561 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/rankeval/4b284b28boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..f8e7d2806010d11e2bc3fc26aaee8da3b8ac76ac --- /dev/null +++ b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_4_lm-eval_global_step80108_2023-01-30-19-47-03_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.336, + "acc_stderr": 0.014944140233795027 + }, + "anli_r2": { + "acc": 0.34, + "acc_stderr": 0.014987482264363937 + }, + "anli_r3": { + "acc": 0.3516666666666667, + "acc_stderr": 0.013789711695404801 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.33413848631239934 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.4049990041824338, + "acc_stderr": 0.004898886080687925, + 
"acc_norm": 0.5279824736108345, + "acc_norm_stderr": 0.004981961097590808 + }, + "rte": { + "acc": 0.49097472924187724, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.5422257300710339, + "acc_stderr": 0.014002284504422438 + }, + "storycloze_2016": { + "acc": 0.6734366648850882, + "acc_stderr": 0.010844543793668893 + }, + "boolq": { + "acc": 0.5605504587155963, + "acc_stderr": 0.008680693125810188 + }, + "arc_easy": { + "acc": 0.6064814814814815, + "acc_stderr": 0.010024426884292557, + "acc_norm": 0.5917508417508418, + "acc_norm_stderr": 0.010085566195791252 + }, + "arc_challenge": { + "acc": 0.26109215017064846, + "acc_stderr": 0.012835523909473847, + "acc_norm": 0.29948805460750855, + "acc_norm_stderr": 0.013385021637313572 + }, + "sciq": { + "acc": 0.915, + "acc_stderr": 0.00882342636694232, + "acc_norm": 0.911, + "acc_norm_stderr": 0.009008893392651525 + }, + "piqa": { + "acc": 0.7165397170837867, + "acc_stderr": 0.010515057791152076, + "acc_norm": 0.7236126224156693, + "acc_norm_stderr": 0.01043416238827561 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28boscar/evaluation/rankeval/4b284b28boscar_5.json b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_5.json new file mode 100644 index 0000000000000000000000000000000000000000..56a72d916be2a03f0f7f8bb6ee8a52b8fbe4fa1c --- /dev/null +++ b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_5.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.359, + "acc_stderr": 0.0151772642247986 + }, + "anli_r2": { + "acc": 0.363, + "acc_stderr": 0.015213890444671283 + }, + "anli_r3": { + "acc": 0.3358333333333333, + "acc_stderr": 0.013639261190932887 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.3307297277885513 + }, + "copa": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814 + }, + "hellaswag": { + "acc": 0.40599482174865564, + "acc_stderr": 0.004900798868048132, + "acc_norm": 0.5313682533359888, + "acc_norm_stderr": 0.004979952166595542 + }, + "rte": { + "acc": 0.5270758122743683, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.5461720599842147, + "acc_stderr": 0.01399244156370707 + }, + "storycloze_2016": { + "acc": 0.6766435061464458, + "acc_stderr": 0.010816828633068225 + }, + "boolq": { + "acc": 0.5620795107033639, + "acc_stderr": 0.008677388652709263 + }, + "arc_easy": { + "acc": 0.5963804713804713, + "acc_stderr": 0.010067368960348216, + "acc_norm": 0.5904882154882155, + "acc_norm_stderr": 0.010090368160990062 + }, + "arc_challenge": { + "acc": 0.2764505119453925, + "acc_stderr": 0.013069662474252428, + "acc_norm": 0.310580204778157, + "acc_norm_stderr": 0.013522292098053055 + }, + "sciq": { + "acc": 0.917, + "acc_stderr": 0.00872852720607479, + "acc_norm": 0.912, + "acc_norm_stderr": 0.008963053962592081 + }, + "piqa": { + "acc": 0.720892274211099, + "acc_stderr": 0.010465657948498228, + "acc_norm": 0.7274211099020674, + "acc_norm_stderr": 0.010389256803296009 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git 
a/4b284b28boscar/evaluation/rankeval/4b284b28boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..56a72d916be2a03f0f7f8bb6ee8a52b8fbe4fa1c --- /dev/null +++ b/4b284b28boscar/evaluation/rankeval/4b284b28boscar_5_lm-eval_global_step80108_2023-01-30-19-47-03_5shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.359, + "acc_stderr": 0.0151772642247986 + }, + "anli_r2": { + "acc": 0.363, + "acc_stderr": 0.015213890444671283 + }, + "anli_r3": { + "acc": 0.3358333333333333, + "acc_stderr": 0.013639261190932887 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.3307297277885513 + }, + "copa": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814 + }, + "hellaswag": { + "acc": 0.40599482174865564, + "acc_stderr": 0.004900798868048132, + "acc_norm": 0.5313682533359888, + "acc_norm_stderr": 0.004979952166595542 + }, + "rte": { + "acc": 0.5270758122743683, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.5461720599842147, + "acc_stderr": 0.01399244156370707 + }, + "storycloze_2016": { + "acc": 0.6766435061464458, + "acc_stderr": 0.010816828633068225 + }, + "boolq": { + "acc": 0.5620795107033639, + "acc_stderr": 0.008677388652709263 + }, + "arc_easy": { + "acc": 0.5963804713804713, + "acc_stderr": 0.010067368960348216, + "acc_norm": 0.5904882154882155, + "acc_norm_stderr": 0.010090368160990062 + }, + "arc_challenge": { + "acc": 0.2764505119453925, + "acc_stderr": 0.013069662474252428, + "acc_norm": 0.310580204778157, + "acc_norm_stderr": 0.013522292098053055 + }, + "sciq": { + "acc": 0.917, + "acc_stderr": 0.00872852720607479, + "acc_norm": 0.912, + "acc_norm_stderr": 0.008963053962592081 + }, + "piqa": { + "acc": 0.720892274211099, + "acc_stderr": 0.010465657948498228, + "acc_norm": 0.7274211099020674, + "acc_norm_stderr": 0.010389256803296009 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b411fb3df81494aab99443ccd63943c330d18b6 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0733094d0117a10d7654a7c16829f058990c0cef5e01b773b5d0700e6b1cb1b7 +size 199058647 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ace11a9b2533c8e3efe2af01e612ae61a0ade369 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:611daff5ec8ac7d894e03ced13736c355976d3b7973f2521cd2db99bfcd2b6ea +size 199058647 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d2cc1d383d3fe3d53b3a39bf1b93f6e69875ff6 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c8841afb91d6814ab82d9785544e8d5057b9326c8ee240b64a4c64ae648ddaf +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d527875934844bd2bb7e136590981bcb01c9d71 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c81bf02f56a57416a11a74260f54e40945a76748e815541d72ce776777b0572a +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac74556fbe2b623eed30b6e30336369c9a749fb0 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49e1d2f605efc1dfb5e74eca8be677fc609a5ea3ff814a013914f2a813915f92 +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1af60f188a8e3e3223db80de315bba7064da6ed5 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43b8ec4532c99ba08fb6770f09f0633a291379a97fa45221d213763bf31afcda +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cb6cf0a4c42dafb637a5e6b843c9260a6481058 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6c1e1f9afdcea2e3a6d9ea580ebabd3268e1baa50cf2fc766396441f73f1c39 +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89766f461a5367218d9f9f0d1ffb78617b6b7aef --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb8f4516af3bf0ef0dc16fde319ac54807809dd67cf0d5b9382576e0230b955d +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..825592d63c22159b21529935d51b57bb52b1e070 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bc2b9d194c05e7fd0c15730b9a9177a1fef68d418ffab60680f624e26e4101c +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fff82b9fe4a37a8a215b17c060f7a6c88831e614 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e6f0aa7cae9de7f4178cc07260fb9024ea38292250375688de1334ad1b8c060 +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0294d03530ef6ecddb7446aa752294d9327f1da1 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af04d2e8085b6fb48e4ca4720614bd83cb48a3c7ada9397f37552180fd3ac09e +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f9054b8371e4dcef5337f8e88dd1973b60e4618 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:136251ecc8700c443fb412e05ce561829f49fa5b117ffe257459f1a00fd6402f +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc7514a68c83a37c0e6523066bfe106f5e9f2abd --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4030677233ec793f4b05353df7f60990ccc9b77b02060c1ead055ccdf061130c +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0870979f02f7add0e93bcfcbc04691943859c1d5 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2848de7518794768e8b4eb638eb103aec1b622e62a54185cf77c3937d16f3fd7 +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9dd1b2701abfc56daf7f7c5c29f36e393f901669 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2283f5232a1608b7b4bf8bef8ad5b11aef3ef72ff582a338c5a27e595b2f01c7 +size 199058669 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c2963651fa057522173a8e9673cd33ce0bf67e6 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4faa3c07605286a414aea4b954249b2cd3e0abc9c999d26c23cbe9dd8a465556 +size 199058669 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..259d8396dcfe6aa4864a710b78050ebd1615a995 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14d0be2753c6f15197cfc9791b8fac0efa23137d94ccdaf028fb14967ec662bd +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1dbd02d4979f1fa6bcafbb2ed53ed239ca340adb --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4147cffbefc845ccb8041ba84e65970fede070a7028385c7cdf909ee92ff7509 +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..771dd4b6c26e0ed4c770cffb8f2f0c8ea01d3592 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f244c4b06d7569fe022dd13e77d70a45e5c92a9adccb675e650c41844af6010c +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..989ecc37a68174dcff82e911819827fad59a7723 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a71fa09b3b7057ce86354fa8801868d61645b89ba8dcd413e64df06552d421d3 +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e48dad50ae32bfa534923dfea1ae29ef52024366 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:730d69fbd20dd898c33a55e38bcc6d5dbeb696172e6f40b9676e6abdd2db77b9 +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d493a149fe1a61c23655d19c4b4292440669cb3 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b31849b46e6e85317070774e686d168109a3f071e15b1220ff78e61054818078 +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..716f8f6d0b8493482ecdb4582a491d1f3ccedbd0 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5586fffa7f68d4d58d9e5b338d4411ebd44c4271445852df10934878f8ef8ea +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68e8e4a58022173554ed373d8a694b27cd7b7488 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3eea4384f23c23d0e1727563ddd0b0e72b20c1eb0efba9d2acf96a9c192520c +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f8feb7744a53049707347a2bd5d1f894085225f --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5375f3f06c898202fa2318a48682ac4d6e886cae35f39e37a79412260171864 +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20dc6a789eebda9270f656d1b3c7c009de5600ea --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1c1f2d5d4e12a8b91828247fd6fc1047746159f7950f9796b707aa25301a7ec +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2453af70c81d5ec9efcf81dc9f0b3c4c91cfe03 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:122ce43bbbe794ac7937ab11ae345d25d93a0b38ec9ff8a8b7f619fa0b534238 +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab32a3655069e0424f7bea400cf019546f0a36d6 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:074de16cad0f53eb2888815b5e75219bbc78cf1dc66fa5ca8b9223b6fc3bae28 +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e15b9cc3f2162b96f2ce98a5ed7879ef5bca5b2b --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d542a886c14411cddddc94d4f713785098a33697c6b93e47f59c9c1c345f5e31 +size 199058605 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09918e466406087f92158e2ca2987ce65c87a82e --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f38e2479ae61de5470e1443e5e5d5db5fc65466e7b9a6f63720e9f1f69d225b0 +size 199058605 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1730dfafe88d9c11df09c72de849116a013d3c43 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a0688a9321a1c1098be4b5c6f80535e9ac9f417ba3431558b259bd6cef27b7a +size 199058669 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8d443f180173b9df948807e888a1553ce403433 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d53c0d312ab7138337a604aba26eca4010fd591c7d5f54de17547b74a4af89b +size 199058669 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72e1d7c3f2e7a33b3c2ef6f004c42b8355336c79 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:925e466b59490c45151e77e1a13e10c2f50784b942b8d9b20d66306eef3eda39 +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..793f132679b5b8c0194f10dfdf29eaa523831f12 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:538cc0d5277670480fe56aafd16873f6638dbae9efb9449859aeb315899f2bb9 +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..525830580d6fec590d41906eaf7f20d99457224f --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f6cf780f53a19664d1c6c15d30e3f77da07da7165267f0bd1654e0a153e42ea +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68483b41c27a00c34860745967fc1f9d90b2644d --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f0d789a31c538729fead98a0663ce8cea3637108201af7d8c3cb6dafc5f1694 +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ad9032618cda01ec91745cd0ea190334b7c47d7 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd2149411325f3311aa002add857b016ecd67c1de5120fd2f755d0d141f0c6b7 +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ceff54ffc7cf7235b70cb0d4adb7b623e869198 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4bf2dd2fb338dd3c56a341f3958d2d8111e802a3a4e998111abf4096ff0398b +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c632b14a8b10b445748126dc47c63846dc745b65 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1494b4e08743caec020d41700098e6be604b5d40c4819100923806da566eb3a7 +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5cd3d7d7fb01a7f36e5dd0e40015a8268b77be8 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d88eed17280cc6148dc4de9333b45c263e2994cd7ed6454bd19efd1d7815a65 +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6503786de3d93955d776c366873691f760ae30c --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8a80dbed7a0de810676165f293ab851a79cdafc0efb6d538df2b4cdb82d04d6 +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7901b119be41e444aac983288c8070ce9ee9fbe3 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:355ec9161e4f9a723f9f5a4c5f159ff29ffac3f23480bfcbd7b78e34c42f5205 +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62030b17125290adc005d3e2761f1e970e2fefcf --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45f5758163811c518a935aff8c0a3b88f51b0df133c8f599abc71ec5e2f08763 +size 199058669 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7896ddfa19684552777bd12423795ad78b2a4106 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08783a15539d4389764db2ba8400f35f22f6b0109e367b82e88e7f5ce4869907 +size 199058669 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3dd148d526ae150e4bdf4ec9e4ebd4727a76c27 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d733682d60306a3bd966804d5d96eef9cd6c5d15d22236e6a2bfd3261c002e1 +size 199058850 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aade0bb0991b41ff164ec84b7e93eb4d175b892c --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64b7a7bfd16e1a68f9d0c856f9c47edcf647c7da5a16396b1dc9b2798ba1eb40 +size 199058850 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..90d091fc419aff909e5c5a73eb68f165f8ca5568 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f456a3f31560495a53be8df3776ebbf7ce243b671ad38f67df51e92c30785d29 +size 199058669 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb7ec21a2702c785ca509cb985299a2c5ff698be --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:90e5fe8dedd58f243d884ab35bda00446806414ddb08492189b27c8e96a81c7b +size 199058669 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..beb1e14973f680fb3885ca5a6cd2f1a9de56b025 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fd99d719cc57068a8b0ba0fbd20b1be538f8395722735e5aa4bca04efe54a44 +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3678bae37aa957283782bf55640e40dcf63429ce --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:140e22b4ac4edf40142786615420767b3a3aaa08abd3263ae534c3503cc9a44d +size 199058797 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f8edc4ee01c708448ac021c4373a1fceea1fddf --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:291ffa83a11647b3ba3c7c14bcb6d783ad93bd039383bfbe59e30595e69bbd9a +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e195c5e32ee4b077c87a4629c0ffe8472c915a13 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:831e757af7aab66b38d1a19657b8b24a2cdcac2e62be86b3e75ba0158b608ca7 +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7fa752a24540581c4d6676bbd3317093ba3baf8 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9601a30302a8302a44c6f5232a128a3b4d842bf4189f68c66b448e51548af2b8 +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1081fee34555969133696aada0234c8350e1e110 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86f94fbba2beae00b3398f19f8c82d1fa35c9038a18ade79db4bd6f8cd1064fd +size 199058733 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63a758075a24620edb31126f0ce997b9012fee26 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74c4eb03f1c78ff76d25ccfc197860b34eb969107dffcb294c9a042731b4e6da +size 199058669 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39b3c541a7e426c83737f0bb559f5db3e7df0c59 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:decb4362250c3e3fb3664aaef9ee50e40b16a1e06424ab3f870e6674106df92d +size 199058669 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..555527b8594c1c65c28d8dec56c4cec1282e1008 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:603668ff729d1eba769a5957414adabe43a72319db848ea7affe15bd6ec00b97 +size 199058925 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fa6c09026a03e7ad8b2a5742a334a403cdc1bf6 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f21d2537ce6ddece685fb4a8665afb412e9d6108b4f99c1ffbc739fa5eef0ca +size 199058925 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5441ddfe054a2e85d3a6ea8888c68747ac613f4 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39a3eea435b95d11fedaa307a4ac01e6b5e1e0939599e7651056083f10f2243a +size 199058605 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5be042282d87675ef8d83e854dd2ed605b418884 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:095618680de1a3f15c122da418a914395a86105fe7e541088bf147c287a71d90 +size 199058605 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d346e853c6036d6c8ff6701eb74dc5041bb9bb8 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46f9774e7ea839897a8b811d2918b4f033f4445cfffdda1902f4a4441bd877d2 +size 199058605 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e457226535dc97f1546ae1dc5408149cf65922dc --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d34e3db65773722f1ef20f313f80562d8abc0d875133139634c25644eae42ee1 +size 199058605 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72fd985614d666272ecc94b0266d4c00d04c6336 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e24bab94ae4fcce9283ce6828074112f721caef27f6149cff7930a069330f63c +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ae3e5b7f61fa0068cf8369a17c89db274622eb4 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:509ba8baecfe5fd4a9d0a436eb98feee8829baf535c0ff41b1732959d79d70fd +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92606d17ac67add4580bb1c498152b6aa967d1ac --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2769a3972637d87bf7d2d600bb5514990b68c02c58409108e7ed1c8da2014fa1 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77f530621a104d5e28005ee65efef558fc2df2fc --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ebf9c7e7889f50f2c3c95299f14abe4a6e798d11d410034c29d3528fadf2c59 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac40992177eb844dd8a82714ef0f33426a318ff9 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbed3e7df9761603f3b5a267fd9c2340746510c8fe03432283adf258fa5b0a8d +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..4a5c4e34336f93d69a9c3c7573cc414eb53a9091 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf2630091288f65bc1e332dd89180afcebd8ed82bab9841f2d51b65455788c4b +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe87c34a86a810800c51526e307d012f8d7cc662 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0a4550f63fb263325e153fc946ce671ebed0ac62957f74c9dfa428f27d12554 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fa8bd857742bbdba8af803f61a01ae8e43cd0cf --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af235d233e9208ce503f7be31253fbf27020fe5d262f708e7f91f0a1d5e3390a +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37886bf5aac42c70123db9d1fb5990bae7599999 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f16e0cee014de6417d8192a085627bcc8b194a34aac3bef780cc7434c0cde6cf +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79c3e30195132136b6886ec07880356744419ff2 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58dad3893f67d295fc6eebac0854de03d646bfaf267d5c655c3c0832c1192a71 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8e65c4c5893a6e5bf053441fa9b081bac0cf428 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e7d7a14051e8922946dbcf457be3461b34699c57bb8074761188e548a6bac4d +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d552b7cdae81c23a6316c9854700f08d1ae5437d --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:db76cdd3856b07c2ace0c08dd51b38d147431711a5dd72fc20568c631f8c233c +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a1297115e6a0409d35d1a30bd122e434d6602e9 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7734763555e2ade3ac332c795320d1fca0879271a7734445bfc13e453219d765 +size 199058978 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e578d0a4924c9991c78a369abb939173c5fdcb3c --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:988d3764a683184cf0ec0f2fd8af2e424471ed23bb9c07504e09acad588fa374 +size 199058978 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91f86db9813718b344b31f952823cb91370404a1 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06b5154d285634e8ee81e13232f5e12767974ff0d10360a906ebeea3fe71cf86 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7a1a8660accbbf0c5c1c78a221b3e364cfc6d28 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a5f28b76cc3343382e8177adaaeae258ea2cf6edf33c151b0abe9c864d24ea0 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a460da0653fab385a2987e30ed6e0632a24c44e2 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c14e1f350aa574570e0134132fd7871f01705edf752a4a3911a7818b0127550 +size 199058647 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f911d5e817e3460d4f4eeb13ada6697682491f2 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8231a795d4ef424b614f1e5cc95afd8af8ab7e1526e5b0b3fc7f6b55ae1ef018 +size 199058647 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..30e183dc2f28a7a3531280114e41ec8e4c451156 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e93bf1c233c3e73f837f51f2e69add95dc7cbffe2909264e7288e07298e1f78 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9b314565e8f08d4e89d7438707474a7f3da78c2 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e1f1960be0c0cc2968aadea42449c48ba6eeef32f3f65f00179a2c8b21ed4f0 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd558fd5ada1c564d0e9215a11bcc963f6ebea88 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9380f7e33a7cef64ebddbae8a9fbaecd165e3c4756850c7a50bb4b769b6f5a1 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bbbe63e15db7c2396a019c25955bc63f168a8978 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51bd3a021907e372ef8d6eec298c95f82d4dc485c6e5ee7ff42b82b8f874a63c +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ef6cb9f9ba887e9b227f922cf302d22ca0a23f4 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b9ff1378e0d6d7cf2a974b1640d9405edb1fb4f5ee654c66fa628b9b590c22b +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc846ab95417a6ed416da3ab102817cdd7a83a41 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826958789921637166777fa3d16b25b94d271d8a2c0280f120dbca8742a21d1b +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10841a318964ce5da135c34cd5cbd4583006cb20 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec8226061e8faffe10ec4c0ff5b1bb84487207fb95534f97da4c4e81b3e47100 +size 
199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec9cb0fec92206130fda55f86fed0b08370726e1 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3378e0650064e58c679daaad14edfb41ffff0c986e8edd54da1d85e43ec6166 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3edf965f30da65dc3611529569363bc37ab7ea9 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee6a6d0225d7f812e7031e4a9e53211666e4f8831afc77944bb02d914a36d50b +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc27f155e0a489120a21f618df4485bb9283526c --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91d0aaa2bbda0d6527594049bf603457d3f375e32247866c8bef98eead378c33 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0b893d19fa12298a79f0d70b4006eefa9c626f9 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:543a0b01b07e05fb3448bf7478f7b49bad55c30e34439b922f5a9f6bdbfc630b +size 199058850 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8644e7ed1f9c693191c0c362b4d25bd1911f1608 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4a53efe91a7a4823a41da4ae431a5fcb3545f0c2f0c84f4d79a6f479469bc2f +size 199058850 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8156dc9a9468da0852440bcc57aa1eec5b269a7 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4013aba6f8ba20ab31819bb12bef6f4fe1b015288c4545e84f3e90544267b21 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5701fa4cfea40180fdacc41edbfd35d68deef2bd --- /dev/null +++ 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f56753229ed702afd0c33959838825626666a0333ac836a7729274a386c2e825 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dc3712d4ea0ad2a8c27a71ce70ecd51ed1bdbc8 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0c87048f8c228684ac5f41be2265a8e1b8ab43e4940fac895badcc7c979f731 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e4d1e08034d1d3425a5a80a93f7edd47adc856f --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71a0b5f286e7d1c9dde4e879aec89d44020d0586e3612b24d9d0f36d5ba4ae2c +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71a4641845da440e1a81417a257f3c7a5127146c --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:188465ca2a6ad65bec769b6ae92cd448af489c80c381361f96f62f1c4a250e2e +size 199058594 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72aaae2887db848fa5d87b566a69263aacdccb32 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4407ae49e534c9eb86e84cc2d31b12248e111076978b796e26b29a02e24d4f7 +size 199058594 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6f54be65b247070833d7b383a55f1bdcd9f78ee --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c785e4f4f35362f4732acc620159370a4f4ec64765a294e0134269d2927c1efa +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..021be33a3a3015447df52902401e6c0d7f69f547 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6193dbb106b19e044207c8582a1b64795018d5576cbcd0a279bbc03f4036324 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..872ffd8c9e9400d90c9c3db100aa111c188031c6 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caddfeba8363abfeccce74184eed0ec8113761f70f67e46da3b5c68f32a49e19 +size 199058711 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bb3e7c97aabe11d91f871cadc6b4efe26fcb425 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffaaa599534bdea50824532de2f89f6e408e8c3dd2febfa75e13a6128753c67a +size 199058711 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e775c38002a5b446f298a8b641c014dd8ad293a9 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2b23686570b1c18d8e46cb6008d65f63b7e051c519ead23b0ebc93c3a045258 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23727ae5ee40d28fa256eb60b2cf47270bf2b049 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:671fdda810bf6a1364ee9142d924795977385cc4069cd04827f27b9c21b07e22 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..882fb300084463ff52e41be3706126c69d61ddea --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:440fe7794856486dc5a98c160fb3e6c9493a3e9ad19cf0a3180808ae99e49f17 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a60a17d6c3a06e2442e1ef4066097ace166ace1 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f428772f35418dac7b866ccb56f2c8aaa4050b1db831a6655307e092408668e0 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d2894ec587aa9d077d656a6420f047ee5361fb7 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:8501a633ecf70c4d50c62dfc9d6ba1eb04affcbc955a00c1cb8fef88464d3c18 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c47bc00c4e3d26c42d924dfb05bd5fa22fcaec68 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de99bd36be5e0c126a76a9b3f52be8671e0e3025a571de1eab1cde0a731f1442 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b826e573cee14985da08758848a8c3e84e1c562 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0473d5f38f2e68688e2fe97da883530558831b068f293495313f0f0bfb5b740c +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a235fa78b1f8dd905bed04e3770433f4dc56437 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1454863d17d5384e587efe8cc47b92e8d96c4bda9b0727864ba6664c7464867f +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..316b12f4276e537415ff70eb3ac4ef9e252b8aae --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfedc46264127e95b540f037ac521a9a8179d8fcaaa62609d3c7fcc9b298bdef +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8071fddd9de8cc0dd85ddd7eb0e5a97f5fa38605 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5064c55d296e86b20bcea0ddf24d4e2ac6e8dfb7e85c0b8d44f036ccb4a6e695 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0e125d5a3926822cbb8bb027325c208110a3127 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d5a7b68e2e7ef7f2b4f72f4440f04583e430c2fb0281f70b7f27b52ede20997 +size 199058594 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..cf0a501db18510f702f9a25b0568614d1981256b --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b14131606afe1d0b636d4a801ebeea26d2b6bc15a524737316901eec70a49451 +size 199058594 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..365b4028fbc5a5bf2473339642fd2fac5c0bd368 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bad636180409e185328683c4e074e86f851e8763dcefc2d2cd3dd594c34d4e0 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..47fd1ca829cf77cd66429fa42de62b240372d350 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fd7ac0eef588dab6e84bff349e3611c8137278c9adc353094cb65318dafb10f +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..385ea0de62437852b69b0dd57c3124f22239c19b --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:590bc8a499129acd4c1690673694c98cbeec9ea7b1ce0d5fb02ea9f146246e97 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..122fd7a4c961caaba45474090ece9ce2c1b2e6ac --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:356a65056ef4f6fc6574ee89cb1c79d288af2168f79f70a70b306ff351e3e7cd +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76dfec51fb872e99f207f1413407b4a50d18c5d4 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc9f9a67dae49e94cd93765ca717e8e5bfb28e1864651ca14a34b87e4c80df97 +size 199058850 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91c481c46048581f89dbb5b2e6c565800e98fa3c --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0489ef8c9c81b9b4a3add26f42636f66f0a5671aa19573d2a8438ccafdbfe4c7 
+size 199058850 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ec903723966c99a3bebd2f6ac94df5b61a437b3 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b64a6a667ed00214ac527f6aec137dbb639026bae9c34b07b74cd9a1941742cd +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8084a042c74f35ac8b993a8e2e9adaeea11ce396 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e63866617a9e73bac8277360235391ca933716e5043480c5d77f8b35824000d +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a647ab5e257037e0ee7cc9ef8269b7b8c2809c0 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4476828e2cb5d62a825f76edee8b98377e59e7dcb7da631a39c8cc17c17b49a +size 199058647 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7969b5259942f58143bbac797d795da5230bd148 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58156aa71dbeccbe6a0f72a55dbb88dc3b89204f805de1a7fcff71007952fbf1 +size 199058647 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d52fb2b52246810fab631f2520990453faccd8c --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6c6aa398e4a1edc85d0179615980bb9182ff2001374075317979c71839e8912 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ddd09d2fd84625ff249ed408b3fa8d2b56c201c --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad4e2c072fec99771f41300b1a81a8c72aca249f5efb393d60057ff92ded52a +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3392e6ce8df7cb22495c5cb3c2eb129154030a9 --- /dev/null +++ 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec56137677c7531d1eab9aafdd9f12acfc7ab4745d9aa8f41963363cfd827373 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e14cce1c8b86680e8a93dc946f4ffa2cc4a9720 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8539ee66dd4ca3aaa9a9168c508ee11d5d0ccd8ff4fb5b06698304ea2dcaf860 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c1f6874aff98922d9e805986a9d267102e67b23 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:815bee5ef89e440c40a127e2b988c8b5e0439913b898e31c8441e8f49cba502c +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..718c63a10e8aa25ed8f6b36a64880a7bdae2c4f0 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2ad3fda24fb3cfdc9de41ee5d72ad1b34eae13921dc5dfc49be472e3e0c0df +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2816d902d85fe0fdbbc356e8e1b7f0ef54652037 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1906e0ce91714c751507d712e1f5005836f7286fad8c0ebca5b2598800d15063 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a12677c6768a8fcdceae026d7a40ebfc9e5a68d8 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a98140a69908453784b8d49023735301f4a2c3aecae6681b12e4d8583c09a189 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0e6622a64cbcc4938f91bc97efc8d230297bddc --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8a55ba6ceba8c6e02e182bb29e2a70f732d61926d2839d2830471e1de7113a6 +size 199058850 diff --git 
a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c89595f347c676dcf72e9f784227f06270aee4e2 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9666430260a14ec232b9bd4ee9bace5bff5ba3f274164495aa53c784df3cf90 +size 199058850 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10741aa2c3303856b694cdff9998d7e8d5a38bc5 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd7e6638a10ba03999b4eb23890081a278faa50cad20137eb90ff7c03831c138 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e7ac40bdb9dc83824fdb18cebe3b79eea2818bc --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55abd25c09fa78e0e027a752bcee371ed663c42c0778c82d85f2d34a5f2e727f +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab3067e68ba8cf9e4d6589d26615733f69d48479 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94c2d1845a3a9380937d221d1504ca4d467f6702dbebbbf3e0c14cc3fc3b093d +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f940ff089c2360ea978d63866bbcc73d914be2d0 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04559a76bbf120d1a1b4c5500cd4638bb676af08fd830e3d72e3cf86dccb0f88 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41cc961c8dc25290861d4cc7351c8f29602d486e --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:953d99a3242495b38080bf2d5a6b87a154081bf368f1122bfe0959fd75d19146 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a18ae45653d5265151078d0e74f309f4f06df7d3 --- /dev/null +++ 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd62f3e1de0ab244c8820c0d9882e3624089b01bb4cd50d409336a3e217a894 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..274a0d2f350f4d67d10699b4ba21954899fc33a1 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83150c388a9a2f6ec5347fc7d23927647821af9897d0bd7f1b67c1ae6ef0cc6c +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a002b19dd7f09bacd9b4e8319b792f9aa4894d8d --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f70746614172f8fc6ee427bff3c352373663ce9bd6afc685c31346cef462950d +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..948ea6fd0758e9c6b34bebba7a8230ca12ab5ff9 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c37c2c850b2594994e0cc1701b304c893c5d6fa7e6cc3a81326789b0b1773c04 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d7b424ab9dcc0de729051b8e29a5cedc3740c94 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3226c456c5fc7bd8ae69513e2e36053d80647c8d801b649a1d267ade1b987e4 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a309be40f4a1c708b2270480d0e09426fe1b0c4 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97a33ca7f5b6afdbdb66714bb7bb23c0e56bbc45b10e671e8b87025aaf290c24 +size 199058775 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7228b1954547ec490ba2e054996dc4ebf626002 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b90cde22c6d128e8b89541c7deb77f184c8873fcc112d07164c1290eaaf2a0b +size 199058775 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eedd5fe9d0f4a27f1d94faa739022151a4808217 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9979311a6ed510060d60954e265059361bd35e339b5267acf555ba14bfc0ad4 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e21cdd6c6e2f9a88f80f2570149469a8b0ec8f94 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b564a24210cb0b364800462a06e128331b4db21fb265d0565083b5fc21558f58 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..156f064e001b319420e2e252014ff2a3516ce051 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c5aa014a04c2d5b2558934b412c600e64fca221edf4185c65379a744efc94d0 +size 199058850 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33cdd8da57edbf742caf5fbac76045b52ec022e9 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c24b0a06c2606e175cfd935083335553666ca11283180f497849ce97fb68032d +size 199058850 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4510e21436441876f00f54d30f41cd64d07affd3 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8978955b7c47c59766227bee7edb5cfa70a90d793f32ec2403625f9375aacdd +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd9676c0464fe46c36416051bd9bcf852cf596a1 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:621dc78631c06c33cc78343c0a6680e6e5990ce666579269c26ba1d0bc98a683 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..06677f3ce95ac8b80355bdac7f5e391bd9bf4a3d --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d4f8614f8ca72da98a8fbdc0666364c316dc62dae6997c06d1c98da120a5d29f +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4c76e919dc0b63e08c045dca293745292205387 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4af96692a11f130e3ff0ff7fbcd53069ceafebfcb1ffd6844577b55217bb08ef +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f23a4ef99146aa5e7448bf53ac896504fd8ed151 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea7107eacfe91144e1160e99b893bfc29fd537eb698f5d70ff8f2b94f6578ac8 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36448d6d0d0acfcc9de8de653bb69d154ac8178a --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:983ccfaa1b8323186ae23be5a75285c62499097488c206a40534201a8a5b7b3c +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62ef70d985f821b27d8dba45b0dd9c7d10639ba4 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c4b928f6885335451a64f2616f5f95196a52aa1ab8200b134f70c2f8ba1012d +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3376ae850af46fb9551bff1e114d85245a3614eb --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af0e177dd876aac11f1afc64655a0b834e3efb4db6d1716eb586d5b6cfa078cf +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3feff8cd2edca47af236f2c6731d4ba69affa46a --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d28c22a681cae4333a548a66d8414bde1258ce3a9af76a05ad79bf2e29125137 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..bd75a4b3c8ba3aca83ee4d6b906f30ac215c1de0 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8707c5ea855f3a59d1d23e8fd548275963f58640eda6ab7bc19800b21474ca9d +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcf12aa7ad33318cce0b311d1b1e71797eeffffc --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9138107d7e63f917783c314afa1218eab84c8ad298d542102e8df6e4a02ab02 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..341673fc4039f00d5118d92c4c00ad41c634f7ef --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1568363a8da7710c01d6b6a26edd9625a4b940cb8d9dafc423f30b025e6532a3 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd12ed6d2c6d74a66e2a35b7e414297fe082bf09 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c56aa12523348a7c94e6eb878602b14b330c1979ec042a50901d1f86bf9d0031 +size 199058914 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a3bd3bf4bf05a5220177009622b7de0735b6fd1 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5dbac6a9a336520eecaef2d4497ed1d6f0a266bcc16a0f5dd735cd9ac51ee58 +size 199058914 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f45d60aeba6737d798db629b60d8a34e430965a --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e55eaa949edf79779b536331db1474468c6e1ef0f687b148e35b95aeb878112f +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..269f8dbb1e7d965a82cf80bfaf9fda63095dea75 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0158c31d7ed00eda6b46221c71c6c8d299b496e97c984cbdd1ae46b0e482a7b7 
+size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0724ff3d661459e330841d7bbf41b5f9e11f2867 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:427a07e0f55a0e232516dfb4e3e26cf175be2e15c5547809ff63be3f0c98213c +size 199058711 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d4266faeba4413a685a7d1b8007b126600e8fa4 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e3c17f74333164ef4de86572406d2783fe3cd529b0328a793e8fd2180c17762 +size 199058711 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aac45ead5bc19835d3ef6f31fe36f1914592feca --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:367a6d7b76536f07ba35ad11381e031fc23986e9f004230c6ac325b9ed8e7f3d +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..903b133f24f5a7f9d65f5535896e53f5022d8eb2 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b2ea537444f962857c950450de0ca2444f795d287408b8c6616087259b62107 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..895f38d348686f43a65d7271a5276c74e7176fc8 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd4107646ed4a6bd7e153c6a3472ab428e0c94d74bc65c5ce278ee1e185fc99a +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5e51f09717c5dadf920f6dd679d4be7ed98151a --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:980a1973760aef0b0fdb6de835da11da918571d8c4f751f1428eaa1d712e1746 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80494ef0e5a1e31a395200e6ccf3932bbfc2fad5 --- /dev/null +++ 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd142e19f2b0ed1d5150a9dba269a93920a8731592a3f4ff4ffde5819c61a88f +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b76de37378046d578a011dbfa61264194140255a --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4aab729c9d62d3ae5c8af7f7083af358f9df166e33efe0b161edfe2cba54c28 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..863fa6cab8b017f691824a58c9f208c6021a7272 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20d87b94de57c60157a1fca49c9ef34c577f06d2c72d0142ea325747486f7c07 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c72256172d3f6acec21166485b54c2efd2f939b9 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ef0488cf84759414e77c85ae5d727092b830a4a0a40f08d07fa98d99c72549d +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1855c439e9ff4bf761960e1c1d77c94e2ee9445a --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcb96703ef4821dfb9a2f6ea2e0654117c16cac29529a183235c5bc07f9c2452 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18d529715c493d85dc7f3629115f88b3c3c62a74 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cb546795595988c70ec7b328f951072199e6f9ebe812cc43a71c151f99e64e8 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..008c24fac12e480d0934a26807df74e511d856f8 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f07cfa71231b18bee3629575024d16aab3e31ec11a19e7728fa8fc6c1258513 +size 199058850 diff --git 
a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..29c0980a83ac35643b8009ef68f85724fe84e3d5 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:199343f43439c17207697e21607231e32effae7e25e1bc9952293b0cc930ca09 +size 199058850 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e620e90de45f080ead4a5b0bf71de97bb428267 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6f632973f845945b0a03db890dc1328c697b411ac152f49f769f26d089531f6 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a52a43e10cbbf01ceaacae846fb4623cfb738164 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:196f91dbc392cf693cdb524018fabb148d56f40bf4512f2a1e50fd1ed2142e7f +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f17abb8522ac80afc304983c4f9d4f7cc2830f3a --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f51ee371263b99f2b8a0eca8b271446779c55a8b2db402396eb755958d35318 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3e5745143d65e856dc1d9a384931dcdf4a26041 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711ab8b72767c591bbef0ea08013381ef9da8284720dbce804f1db6c9f70f6b8 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48f28050e99a0322184c6e1a368785277df02eb9 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e029f68e8128d9961c82f3bb19f49bd01993317620a972cf1c4d64a8c33c497 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63d805eae82dda1fb524c729d8f41f985d92ad8c --- /dev/null +++ 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b89f84ea909a717fef82270c902bdf51010d96111cccfe4a81c41248b28b6a9a +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2cd975da0941d8b414dd7eecfdacb31f83b98c2 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4913713219b7f17857b114e97e5d1d3f22f8e8e21daf53b53b9efef9a26d4bb4 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6d7eba47770429573aed7d37d200411d05e396e --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23235826b33b477eda545af51bfaf048dc3becb57b10460f624f6ff35dffb07c +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81d67471f887b8f005c04b41e0ecb410b92bd846 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d5e9f6f9a51a5b7414bdbc730d8eef6c5753cd23bb4a9e9406af15c3ba425ed +size 199058647 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea43d184a1f9596da201316e79d1da6714cd6d67 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c0b5e25ab51158194e5ee54add98e26a59fe13251637afdcbc43c326a1eaec7 +size 199058647 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61371257bb7b89095c2b77c775b0eef75c01953b --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:076748ecfefd86ab78f499ef5aa23489d0a8e3753ce560dde2c3521c215caa97 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c22bd399cb522f3a63d4e1b88ff1873bcc1f60cb --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2efd8576991b0e244d3d53b8eb2df305e4ad32f7e11998b8be94565691f9ed99 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cbeb252aa33a8ab0f2b7c99d6f8cad2ccc2c00c --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cf7efddc5d53aedd3cb5e6d28f281e171090679c26d30d4d509f357ec0c2c59 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4d42c3d2d8f923863c2b55c4bd76acdf66b406b --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b3c7685725c405142afd6e9e6e2069aee2c324aee47145f88f17cfdf095c13 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8191196236bfab04360875792bdade38b15c78d5 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7973df59b5973df5451f303524162c034cdccdbbc8320696dbf4481b01010a85 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0d7449f9385046ed626d16e9b868dbd79106cbd --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:223f1e483e485efc9c66e78d75fda9ef6d22aa57c02ca1ebb92364fbad1fe65e +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2353aa38bf906153931aacb99a7bb9e7054bc77d --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be13e8c64172d35e8c93e78923f32669721555edfdc1a72ec3c1b238db8b6667 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e50e3ef239d7668312fad36088919f080baa8bb --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c23b1af1f4c867b69a6d084b1b9c82d2de3e0b0c0f26879fe528a68ea53c7934 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdf4cd34cf8d737644888c3e659ae7a36d55e6d2 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:1efb0d8794872d56d791fa126fbc50e67d8927729d2414e875e98f2c5b816887 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bc1ae060ed9d6ea23e96e9d9853dce67b05d5c4 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27e896c9139f1fb9053e0ee08124240e141a2712adc0fd1884f3e8cd357f3a15 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9be789fd4475f1e99b89233b598ed24249fa8664 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da60b700e4834992151a207d8bf857daff46808cc150e52b266dc2610ca8b5a6 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d100ab2dc30db6b1cd729cf5923c9ea4bf6f13de --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26d65228e34b1311c55e452ad7f8465f9f2484eef769e6ffa316799e463f6d42 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cad3d9b30eec36c3698974c5349d0ed341fb8f35 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80ee80075f5d0ed1681116046e9e9b3a2b5d555a8f552922439869359ff0258a +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb1f99f17fa63a1f7b4e1e4e424b76b2b5372e1a --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6129827acbbbbf1579c5c1aaeee0f791489f72d3869d9d5b94e30070c4c07c4a +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9138d2a31e6913294e018521657f1d8aa0f32a3a --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa23dc782562f87bca5fa8f26472a4a41d6cd05e233e04973ea1641de945bb74 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..a55585e596752df19c91c45e0f7292d59fd5c0eb --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f45ac0490a12ef79a57c22bb142fdbbed15bed1ee5270f706660377e3845bd36 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6494be5c9daea2594515cf25e90907598be2fd6 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8bad8a609d0e4febbf413a4df0d8c82348b5a572fb71d23af4d0852b53c5c30 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc122f845aba131e345ef6828bb907e65cb40e85 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7edcd3bffe02dfabf003f4e6338a5ac9aa4f9c011bf0214f630ef1de46e962e0 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff7de1c2b8dcaf19aa227fd3587cc52f72da224e --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30db5cb0caff20ebe556772c671e39221cd8c1a0a5426f4d42aa63a65ada610c +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31df04360e657d36793fd0af6b474fdc8f3e9c51 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4398934bcaf2941ea154352d3a444a85b4174287d304675cd8d4d3aa8e12213 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..487afec3d64d49d51f995c9e4367537b38e2377f --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0b84017805b09bdd8c1ec48cd348233d33e017dbcd89deb0e4240e92aba22b1 +size 199058647 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5aa1ff8690946f25f0f2e09afb56a71a88971e8 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3af0407ae0fe01b1f72652c169d8a0e0a0aaf1c84e56956518a1aae31f2a6292 +size 
199058647 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..842a7fed9a9724d2f41d388c5b9e3df6b5177c07 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f4fdb280f7b172e312f0eaba10b9c218e4cc3687b7d869c15122e3a42041d91 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eddd768292481fd4d9b0adca85ddd69c5b1f3fa5 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aff650a11c358dd1686c6384c91305dddd9e3edc650cb4001d233b071e6180f +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..173d4779cc3958f3cef1835c893c9ff6f869cbe0 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dd37dc3d55b5d775337fea966facf8cfbe12ab439337995029957386ded8a68 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..159d4963f4716b04c0867f7ce15634938d4624ab --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:930c05522fd6cb716946a699bc4a95a7a5075c1de6ed965cb5515b7cba1909c5 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e126bc5325680dd39c478d27365ca3f4fb1eaba --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ff38f3ffbaf89461ef1d7b41e403a9892b0122f6f95217b634c2cf0ed863d67 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3789775d081edb10c1d19ba589047f2daf7bdc8 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7cfbd6623fb11a05f2e204e5bbe45fbe5e057bb331d510538c167447b919a6 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdad14e84965fedb387807ab62cd41b4e15e5b4b --- /dev/null +++ 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:886da1263423202faa41f45913615862543f2c30b264bdab53bbbeae959cb2ac +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d40ce1b33b349839ec6fd816c7529e07b99c0eb1 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eb96cb0ec45d16aa21b4a7f7f605a1e781e6938d03354f42661faea9b223d95 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdcdc89fe1b7195e7a13fd7d10e4c5dc0d005eeb --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98483ae9deb459c97812326c00bee3460771968d7425c760d3a894d9e7a8b19f +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a9887a0bbe9fee4e0a673700a19116235e59255 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93d5e4302cef6e0d957ca4455a4040b7e4f7d3d0f85750656a0ba8f806962df6 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5582a068aff5829571b90c61f0e7ff6b27aa4f2 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c394c8d5eec8e30b9ffd32b891e9ad57fff4566f746eeaeb410536f76edb49f +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81a937efd8e9411e04aaa63c8d49c35292b9b1e4 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fbc0847d598d9f3bf7c9dd718c03912fa5f534af691d5244f21c328fd37be12 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c521e0f4013f5874022d2cb13236fd2851d30994 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:558bee13b806384400007032d2cfad75cc300cddc07a8dedb493fe7d1035ef80 +size 199058786 diff --git 
a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d575f05da2810843a9cb74dfab2177b1494c5503 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8af9d2818b036d19ca680eb29f7cfc3ccf99d2d7b6db30b609707ab937a7b75 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7c77945ad493663b813970bb9875096204da5ae --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36872c231e5911fb729706d6677e69679f2533e0d0214dbd8e1d703b3bcae26b +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af6459c4582cd863a89599e21fe7d35be71a580c --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:806b87d8aa5c5f082bbaaaafc852b715f632b31c1ba7e4f7629827c448f88e0f +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef00d722a308d9158b90e9b5febb9fc59420b77e --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a937d7a61420b46fad2218b5230ec6f21961f629b94e6de8911e00760883055f +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..975d9d07c882b32c513f540146c2b9868cee7b4e --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:644051b6d4af2c9b422f9b046f4808cd285b49376dd331fc1da85699603013f6 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5a20fe17b1f9a922de5fa9ae805cae72f71a6f4 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f241b78f021d7a246fc75242313f36aef7cf17c5801068fad915fee1e31b6017 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef4bddf35d3c986bf3760f9f79d5c6a8c2a0b639 --- /dev/null +++ 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb61d66798904a47440fda3a34effd7462e5744ee644ead3ea914a69fa84aa12 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..383afb612823f3d36b9a64d4be8fbafdda8270d7 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b3eaec6a44549da1af9a129c5b1b90f7cc1ea6b0effd42b001ade13e60a9f9b +size 199058647 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..959864fa427d5a25eaea1f5ed2a8b01bdaed9c91 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09397e54ee5583a70bf08a07281f78227d537fd628768590964c4bd74c4718b +size 199058647 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e1e35c9eb592438f57cd9a436b2854cc2d4e46f --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3fd64b19e10e4d03c73b1c13d9b7ffb5553b7f444c1f2aa4a7aae6be724ea05 +size 199058850 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2819416ba8fb3a66cc4a9e55caa6295ac3b80670 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e2dfe5bc80a0cf8a1bb3ada7836f7c7a75012deac10a4a423543ac06b86838a +size 199058850 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee327e99bf47702397b782e92516043063d73215 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82c809d2bf8fd0a47ca376eb099ae0fa611a976ec4b63f2cf44bf55653355a4b +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ea1e4c3c514c512758679fb4a0fcccde1b246f0 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:032d5cc626c0805dcc25b821c474e9d8565fcb0d349e180538324a9ccad3bf7e +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 
b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26d0158cddaecdd6af2b756ea848fcda113f1a08 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc5363610f32d0b38b10f0119178c2c213abe8488b5c3069292417681ec7df44 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41eeb17ab50d3397ac22fc4e7942b8b698b74dfd --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0834e81eff99c6b4a478343497fdd89d82a546461bfb4e3e463d4ef30ff7cef7 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..359f330c216e0ff0678fcd3f78c129eb27640bb1 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e27fd6ea1496a4a7c1c4435c173b2883499c033c4a51b094cea11212927cd20 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92f09bb1e6ae553e9feb3f2b17bca5017c82f37c --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a37e2f4c415c78d27bfbf71a1aa8d6f429824738b72582cece3c2d8da8dabea1 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9dfd3c845bbcca54e0b7f913ae57cb2655908162 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35022f43dab4bf4048ffdc853b6f6e818ba7903d5c7643591b7de4029a3759bb +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f1ab710a0ff50654bccc87ac7faa09b73e0c6b2 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0934d1add3cd98c8a24233092b2f677935dcc53dd7d454ac4c4f84f26487640 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cde30c4847023cf7f31d89a7635fd9a42306eb5 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:8f2863099599f1e30e20781046e9baec2269cb21540e849b1c0f652b6cb17f4e +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3018518b6997a2dbf2139ae462cae15ed8b8a1a0 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01dc27782628708878e2105cc6b4350f002b5a17bc1512c3747b0d7f39db2b3a +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..931923605681824033ced8adcc13846e1976a8bc --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26258c7835bbe6b9fc26f8c18516c900e991d7fbc95144fbdf483ca684f44560 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39d1526b080cce30ca41a12f9a17849998c946e7 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7a01c60ca48577a82da0a234f78668ab907d1f08762e5ad01e6eeee935fb5e5 +size 199058722 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fac9e127acc021c011bbb0a6abad444504430e21 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ebb53b8c7e3bd2bdae0646fd35265ea9aa2426d2b0f4bdd14c067446d9b5829 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd3c231eb79db5c408779bd986444754e227ef81 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7fc50d23d843ea97e67af65110e7abe9e1bf22b5ad6cd7d3689251b79ccaf48 +size 199058786 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85d3d01d3f49dca0f213cafe555c01618249d59f --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7068fed9e615ccf329cdf3ecbb57ff9f564c93b45b3ae158b3b60b14fc7470b8 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..4fa20965135a87da77210b1be635888294b76edc --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5418bfe28b9c75d0888a2eb90749f59b767c407900fa5a0e8f3945c4a314da13 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7401f523c6feae8f7ef5ae8dfcc95e410a4321cb --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c281ac2ab77574d8065ed6048222a1fe12bd849f75bea8cb2765565f59cae636 +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..048b6e613aa44314a7597e3da57389afd1da82c2 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30667d1d58f0370170b6d63150e8c64b598772fc7c56cbf17eda1b9e30d5d47b +size 199058658 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0aa691c5f832c0f59da58bbfaedb6291c2340b5 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7632d9305e86433f3428781bd4dec95bb9621891c3693c2e66f5842614f568a5 +size 199058839 diff --git a/4b284b28boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b80e578658cb4764fb451740b4e33d5ecf393a99 --- /dev/null +++ b/4b284b28boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83c09117fc8a3f712620ad50d9e05fccce106d0c56ef6bcf61c6be93fd4f92ce +size 199058839 diff --git a/4b284b28boscar/global_step80108/layer_01-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2e8b5fd03e8b2da32ca055489e9ca74ce9c2123 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb4dfaaff737d2b7abd1a2914faf54af82a305b30fe89e4edbc1c86f6eb2618 +size 167511299 diff --git a/4b284b28boscar/global_step80108/layer_01-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_01-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0197ce2dc2872f1286d29ecf64a9bd4e6932b132 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e33e104c1d57659ce9413177e48f23f7f86f11b0ecd69c1a0502658a036f27a +size 167511299 diff --git 
a/4b284b28boscar/global_step80108/layer_03-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..56f827e6f32dd6c4d6096cbb36d6047a083ed93a --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca27c7edcd4feb0afcf35117542da0230a31317dac6110f1ee81c5702030254e +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_03-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44b3b95a53686272198440a25ea15368ec6a8ecd --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02277a1e5b0e338294d262e5bd42c14aa18b65875c8a01a257cd394950d00f66 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_04-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8de4ef0ca21d9baf5e0635e0f49c02735fb53e6a --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e2f6ee5da84919e28dd043a802989691260d2bbb2c15fcd6569810e9dfe9c48 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_04-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d14603a5c4e6cf7fe7ea896d2b30e3542bdacbba --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf4cd023e2f0fe9ab48bda72b853b206452b2519b34c9cbeddb37ced509924cd +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_05-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2abe0a35a324864f8831731d8b995ac9ed3eb041 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d34e39e0be26877f304990264183037b1613018f38c53b3dbf4c6382a7e670a2 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_05-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a3e71a45f6b6bead191df60fa207eb4b66b3712 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:167d8126e4285435d9b87a16bae81f2cd3f326576ca7ab9ae703702ed89b0047 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_06-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..febeef9d53d6737f8b6d8a3e33125038a1ba81ff --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e53552eabf786601960b791ada4df666451339ceb49c2224d89319aeeeca3075 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_06-model_01-model_states.pt 
b/4b284b28boscar/global_step80108/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbf3b53d5bd36b2d405ea0b4c87a91d8daa70332 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a5fa6d8f282fae1cfa02a0326c1273530de8211ab0df533fe56eceb44616084 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_07-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f646f0bc22320ae467c322bd4ab7f7328d2371d8 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99457e39c58d0030cef014d32b36f78af447cced2b29dd552a724da3e7d2e33d +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_07-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_07-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ab7ddaa90f71397d7e96faf55382a8e042f8652 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e4cf317d2137bcac3e9cca121771676fb10687d280d4c45700ae48ab07c44a4 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_08-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..370d0d3935501aca98152833f2b510a01b2028fb --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63d5a34a27f35c1d8ecc7994d2b0ec9fcb685f3ae7bdaa92275eca3716fc4876 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_08-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_08-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c556f99fcfe7e27d3baa275f3f8202f86d47687 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31c1e2fee76c1fcc3494b4be297e149db4e66fdd6f335b6a1c68f9993ebed86b +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_09-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a51a491af1a3051b9b789edb8c7eef75f0e11a30 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f73b3a491252c13bd7cc16546e31bf82e97baf8d16efc5d2fd08bf24c3904373 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_09-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..264290e5e6ea5f864daf99a16b9eb69feca3b364 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ef4a5bfb3d3a8d645b29988899d9b2f27ead2697614b449bb8ae93926287fff +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_10-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_10-model_00-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..80aec2a04dc56cf994884cb3845ba4729d63e3c8 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:269a42b5db6c67b94ab5acba83f71c1d2473b1c116348de92957905e659b8852 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_10-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2666aa3d25349b88a584fbcab77f3f3e628476d --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a837c9e1e15b954d5694bac00f0aff9429046182c92ceacabfe90ee561884851 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_11-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6533377d6441580d01ef5af6badb4fac4e1d0a6 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41018654008754841498b549e0f71f1d60f4221811ecfd2feae8cb08e5536000 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_11-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_11-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67c5578347993f9b5d5b53460006d623c02055dd --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0e0ca965d5f509a9380a88d31697324f0d7d2446feb491438d2bf9cb29258a4 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_12-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed89c5866f10717f077b004bf0238efb4c754b57 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98ddbb83549e8cae86df5e09ee0c54b2c40c8835da725db931c44e79d8920704 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_12-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_12-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef5c36c412d3c9031c894b96cc1e10eef7478be4 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8f4e1b3e22da2cf06a858a71b0024b15672507eeba65cf0e6b5ebca823b4d2b +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_13-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39135959c6a92c8fd2a93f96f823596b846fb8ed --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b81000142c21862f95104fde780b83a3f75137c7bec8268c5413411ad74a4a00 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_13-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_13-model_01-model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..77096a90fdab6e9a5e9765d2c9ece685c3582906 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:725e6306f05cb8e1f8ae8a05476807a6393e6778e24d05b8a740d53c03e3855f +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_14-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3781e70f10f8965d4ffbe1b867fdbac6ef1a1dd3 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5fe1f38be6ac83675a2a038c535682272bf60dbf5d8a5dd99074f897959725e +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_14-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_14-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..937ec6455a4ce4ccac45605027d8cc107ef56f55 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0d5ecf49bb1c235015ad6bbb2d53376f8ad41cf822612dbdf165f83f9c4cca6 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_15-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e257a787c00f3bc6f0a69adb578b5284b27eb2f6 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:414aa8cb1f7637a3c7903166fc8c32b0f670e0a9dda433e768b7f70a83d1519f +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_15-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..827edce3ec3fd85753e5f2f618a1040f9815c940 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:703bc88ee422faeb9232b3a10243a19f03136aa05ffe6f22462ae059024b2813 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_16-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1ccfb82574c71753edba27079aca1d2c1eb311f --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93493a90dc28f971d5bada827bfd2a91f0d04725b44cd85433fa235c1fc5476f +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_16-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9dd1eb557f7a120713cd1a67df7687071808f53e --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:010850413f6a43980478b680b65565f49d4e17c6671f9048d73e2e088e40c59e +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_17-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7057ccd3c19b7deef331173f8f29f96d392fc015 
--- /dev/null +++ b/4b284b28boscar/global_step80108/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40ae6dc98ef7e3f4d731c0e7e7c920bc36ecce8428bf1ee18790435277383d1f +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_17-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d439725b4d4dace31a14ceb32d39fe23ee761c3 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5de108be9a302328f0a30ab89b2b6db5a7777191c162872b91b0e075c0b63e48 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_18-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6ea94ce3054af3222ed5a0953b6ad4e4c7fca87 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:761a6a827f1e87ec7f110582d64e3fb74f071589cd62daf32fe458e746dbb18b +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_18-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_18-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3fa31262210e58dd5f082c0a84d116dba884bdd --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ca26ba85f96f735029534b7faf22d0c297527f895d29315c92812d0766e3611 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_19-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67577f7ce2a8c6c8c60eab2065e7b699894fdc90 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:280d3c4ed1ecd40a804b709eb5c0d59617ca8b0aab4c4a62643c176b2246970c +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_19-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8a07ee1fc21b309a251d979a1a9bddb287a82d9 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e960616e771b4fe36e4e1651ce1ba66dc4c368c6e32f859e52e2987453354870 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_20-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54a46e6c281286aeb0a178918b433bed4dc7d993 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f7b6b8f04d52f638a7d471fabd3eacd82b3fddb56111c2a67954820c061fbfa +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_20-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcecbba6f1fc2027a09ab3d86af3b10dd5303f44 --- /dev/null +++ 
b/4b284b28boscar/global_step80108/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4223fcac4d9fa44bc768afbd55b8b00d2da803af443f548a0d457b2b397b171b +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_21-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d078af3314a53ebf0156c822cb5f34e77bbfa564 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c348b0c1259791c15871fb480ca6a24d364ba14addcf185ed22e0f4d1b84ec76 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_21-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..000e64d3fd749f3b71e2a70700823054010b0a05 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e26b0505434c63d2bb12c474be446dcd5f50c3b7e1f8c465ddbfe712604e55 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_22-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb2997292205d94307d1ccd30bdc64606171a56c --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d0ebc438784d6915b64933f0627186f91d84da076558082564c990238a5a3d0 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_22-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b389a829237c2e7c35e1257fc08754a8cc5efb7f --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_22-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03349cd7699cd8eaeb6c4f70ddf3706cfeeb3e08f35e7efe6246c391706879ee +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_23-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bf7a47eea916afb3d69777b384447ccf0f4929f --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6068cdd222fbdcdaa3e315f4f3f882b67561f982b1406b6627b21d9c51d58061 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_23-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd73c3c9eb9ff5f1953c7a40e02ec66eb5eddb37 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de5bcd521d4a4f82d03aece70c7dd204de9a564224c3d489517fc87483e0e67d +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_24-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1de0dedaa653eab8318b764fc8cfb80dee45cf5 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_24-model_00-model_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc77d3bd68962832163c394024ebcff93322a9d023c33f0a1c48626889ebdb80 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_24-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1e3fbbc4e70375c7763f88426e3087f3385b72c --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ee6026559ffc35e655b6f2ee6c3544ce1c4431734e314b2cac1e492f9b29d81 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_25-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe02728df59013cf926891c4cf77b309d7cc2a10 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8f657d3cfde86ad3b069a53ac0fb4ffc24e56d99a0b8896d44b9c64c0c524ca +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_25-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9071ab239590e351a3129aefc77c3e1fe9dcd26 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:629f0b215c42a9dd0c5a67b69a1040526c610ec8a090b170106928b795c0b2bf +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_26-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e796beff3253adc2ac868e8746a23de4422c6138 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e97555823ba5750267d3ccc5f44d00a767e4b668e7b8711095ab5803668a5aba +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_26-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_26-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..046c8834059bf24ed148c11bc7f6f8a81e5b0aa8 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_26-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9d1a9f158ac7259e2109ce8635a2f845174bd187824c44e742dc1ddee76f40a +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_27-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1edaa6af7d0cc097ae3b4f146f15093871d9744 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:524c476df87ab8ec8ce8f04616a6ad58b406d0b16c3b953efaa05726e00aeff4 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_27-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64be267404bbf82b4b235ee023377deb72a54a51 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f3c516608edf0e43fa78317e8c374df043c7c27cb3c5c021b8e1bb52919ccb23 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_28-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_28-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e2f17ffcbf9d71d5b5798b5438ea4583efac9f6 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7027709a7eb7f1e63234eac30cdf5031ca6a9a283dca1fcbf2339d27b95bb51 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_28-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3b718dabe1006172c87164cf8c49635ecfa7213 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a937ce0c9a33e2a43a12da620616ebeca44784c3cb2105125854addbef6ef9a8 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_29-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_29-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a9a83c9148ecb5a5bf041dfb9f5912e8a7eab48 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2b85de8be11ff2277c662762305ac3e630c625b7261cbe1c9ca28b81d7cbce3 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_29-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8c79439274748c28cea86bb527c462bda031ca7 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d70240dcad57969460f35536c65786a256721400429d40b08c6c77839324f39 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_30-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e4029edd146871ef3906d6597deb8dd54f3734e --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba655b05a920e809579ea2fb06a0c8164ba79e1e7629325870e8ca2be359649a +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_30-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..201fcfc6dd3a4d8d685ce4c0cce73bc450b02d28 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4487c708ed8d41d3f04dad4d4df014d438d744356faf883f394d366aee938c78 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_31-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ed347877593e5445ab9ab32b59ea998b8ca6f0b --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9916ab2e12f26d078c90d868eb1801072ce028bc96e215083f0b048e066d738 +size 
113308931 diff --git a/4b284b28boscar/global_step80108/layer_31-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_31-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f546d00d9c16ca07e8748578fe5eefba741f3047 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61f2da3aaeef60d1ffa9b2bc649f7b509a3b409f814289749a2e3191c7568306 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_32-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a38cb0604912e1a6e3a85a4d9a5be55cb86ab4f2 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2cdf29b8939ba27e229f5486528d0a05da0930c4ac6b0fef8a01ee14833b072 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_32-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cac3c82a0b57c7c0340a2bdb09338fe22652b152 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:730b4a7598e8a5f59550ea5505360ed09278d11f9eaa708e19b0a1a9528452cb +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_33-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_33-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bef245a0b9933e9ea6b554f0e54717d5a2ef973c --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8387648247ccfc4a9ba319a66cfdf86b5a202708ac284c1d6170310ed9288c72 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_33-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebe83cfa26d75c0692630fb5a170a3b19f567187 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18140fe65ff1625f81c362fb68085622fc84fe50d95c2982761f3b1763913ac8 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_34-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b035943f510a298e8fe5064e9d87a2e2ea4625c1 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:956979d3b30b3685578f6ad2a77b41b2f31db0c0434ff5621d248c7be3ad4508 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_34-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed56c7601c6ccfeb59fb808dc4f61e5e568b24a5 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9acef182b45cbc6a82fd07d2e090b619ddadf54b01f90825c55d87684125898 +size 113308931 diff --git 
a/4b284b28boscar/global_step80108/layer_35-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81fe75cf358297f05aab848958f13644e737323b --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b3ceeec05f45c8937b5c51758ce9a40574ac70a45a56b838581a7e5d0ceaf88 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_35-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0383dcba240d6e2d81e155357ffeb118352751f5 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:179879b55e7b8ef9ccba268f5dda984e429bb46241ff5d7fcd6beb201c0d6c58 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_36-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_36-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..932984db9fed52dfaef8e0ec32613507dde37355 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8ad6040ba70a9a29d4056e56030433c98490c29b18a5d5735add537d6c218f +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_36-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8998d73f6c9ff377f8391d5919e55b9af7d939f --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e483d7f7e63538cb5036b5dd246802e04cf0508b967c69c6c6401669f8adb072 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_37-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2792f0e8f7f55ee8da8d005b1443c8a4ef003850 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b05b5790b4c33c7d6cab9c4f2bc37bdab069eb458dad4e456fdfb02360af60fc +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_37-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00949be5904160905f2c2ed1feffc1b47e70f384 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f346df2312b06f0a8caa9473d75d47e082616e23029e153e67fb5fa0a23d0ea +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_38-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..908b73fce3670701ff3c4643987288bf615b5dc8 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b436ab4f88b857e24ea3ffbea498c602ea009bab620d815fad5ad6044c42b7d4 +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_38-model_01-model_states.pt 
b/4b284b28boscar/global_step80108/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59814b4e46c3829588f876fc71f308d900e2f810 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2242bcc2636c2b14d8ec2d68aab92dac742d2a93f4bb0e5136ffcf9039b230fb +size 113308931 diff --git a/4b284b28boscar/global_step80108/layer_40-model_00-model_states.pt b/4b284b28boscar/global_step80108/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..981ae3e399d4f121e23eed19a1be486dd2f76ff4 --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f2cbe9493c81d199a941525617e081278ef0a8b766e84b56736124d038b7220 +size 13507 diff --git a/4b284b28boscar/global_step80108/layer_40-model_01-model_states.pt b/4b284b28boscar/global_step80108/layer_40-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a9801245df624132e13a928c3672b1cb608a2ed --- /dev/null +++ b/4b284b28boscar/global_step80108/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c95fa7c9dc70fd7143a8894c57235939413bd82cfba20da6eba1c7e0f0e78d1 +size 13507 diff --git a/4b284b28boscar/global_step80108/mp_rank_00_model_states.pt b/4b284b28boscar/global_step80108/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c361f464e174176cfe9e747bc97bf97dcb6a0622 --- /dev/null +++ b/4b284b28boscar/global_step80108/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dc169af7174bfce315549fa719bfca12232b92971ddeff4cb02fe743fdff79 +size 51635 diff --git a/4b284b28boscar/global_step80108/mp_rank_01_model_states.pt b/4b284b28boscar/global_step80108/mp_rank_01_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..587c5f0fa13e42f20beafe4a672afc7d8691ac08 --- /dev/null +++ b/4b284b28boscar/global_step80108/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0907e92ae576624dda614576bb9c8e3ed6934f061e4f1868e0dfa3c9fc4ffa3 +size 51635 diff --git a/4b284b28boscar/transformers/config.json b/4b284b28boscar/transformers/config.json new file mode 100644 index 0000000000000000000000000000000000000000..df814c3a71ee49044410636b5eeb4f43611690ba --- /dev/null +++ b/4b284b28boscar/transformers/config.json @@ -0,0 +1 @@ +{"vocab_size": 50432, "n_positions": 2048, "n_embd": 3072, "n_layer": 36, "n_head": 24, "n_inner": 12288, "activation_function": "gelu", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, 
"max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "chunk_size_feed_forward": 0, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": ["GPT2LMHeadModel"], "finetuning_task": null, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": null, "problem_type": null, "_name_or_path": "", "transformers_version": "4.25.0.dev0", "n_ctx": 1024, "gradient_checkpointing": false, "model_type": "gpt2"} \ No newline at end of file diff --git a/4b284b28boscar/transformers/pytorch_model.bin b/4b284b28boscar/transformers/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..79124f60b83e1dc550dfe852680f3894d6ab7e43 --- /dev/null +++ b/4b284b28boscar/transformers/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93c97e0e86276afc11f7b32986b7c9f2da9d9baa1fd296fdfdc933eb2af5a596 +size 8781203669 diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dad785d87ab1651797df9da507ab8b8385695354 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.3920095188313853, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05821741416066471}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07689265316885617, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021889644764877354}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3353633364771332, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005464034559346198}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.10974192875795953, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002108481379704356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03467177168437793, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001263966142663848}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1501022471408039, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0034656326105627303}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.04944058280627724, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012733709222344247}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07300762564486572, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019791385357395687}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3239944481490108, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.005278665955249544}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10487292177099015, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019145881199173131}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07173881674469755, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020327580166630914}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3108299199198791, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0050441398235876655}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10223587057095268, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001953957091865035}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b1cdf7a895fd0665525366ffce0d3fc0bbd74c60 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5923583934046589, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04413735939354442}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.16010050130964712, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005125042778102285}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3196355072168386, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005119850964520928}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1697750677390444, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038772039426472724}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.08438981052215226, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0035428710754014017}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16375709757632376, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0035801758170607334}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.08671853497591266, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002631475278488475}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.14464852171051198, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004587920039107864}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2991072637349091, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004697289445064134}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.15421967175389004, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003335409750319338}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.14773643541948675, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004697318924441809}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3022253960230997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004739701115545982}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.15698432412836977, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0034258507511626024}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3a0b5a036324041356ea2530de9fbf1c31156dbf --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.87481699127398, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.059582002804657114}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.199466981650156, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005680133053279315}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.36987834122063556, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004990035433475184}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.2091307571621392, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00438758838932181}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.1097332483868951, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003880260643680688}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.19989822315140882, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003891995132588205}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.1125014602113282, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0030818329485191713}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.17708218948059182, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004959206002368757}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.34473442974785634, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004653107328809787}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.18777604196623215, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037686741944170416}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.18275767079289593, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005158519349266604}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3495549840141228, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00470047440035223}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.19268792375031765, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00391282714541826}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b01d69ea4fa06f1a1f137ee67d801fe36ce48fa9 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.917052522385174, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04515993268519611}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.20888904165496103, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005880585457367179}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.37814776640994463, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004965150831364198}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.21372711407619938, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004343895044489938}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.1168574620986129, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004100382644887659}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.2033520821796969, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00383298408380243}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.11478946258154298, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003021059673039041}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.18605811290952207, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005155344402670075}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.35240104768920266, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004560857947768766}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.19232862909601875, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0036812800257055574}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.19147784744930493, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0053469173736333826}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3571044253500232, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004630285578564224}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1967270339578505, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038181408662667426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..86f0cd706ac8e51dd3c740fce342222c17dd4259 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 1.0770812969732624, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.057150178907157206}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.22981660265238718, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0060091983198828705}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.39885875556534467, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004954969825753986}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.23556795007723105, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004533277698287348}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.12750847088352804, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004054232891970157}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.21938886368566654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003991587985278903}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.12790770467752932, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0031721283738045283}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.2007562947847461, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005097419240769892}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3691187056685199, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004545072578815252}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.20953786614403025, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038198330880285674}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.2090073734779189, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0053710149363667935}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3759411168888221, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004622209838017953}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.21611300792836619, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003996019817069793}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4a376186758a017953852f3a7603178f4c6063b7 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 1.1504048600305352, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0755822004199928}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.24380689117778276, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006371373530201665}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3952201821451148, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005033224197794555}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.24203388956598934, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004720213442449205}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.1418504594039488, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004575582136553598}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.21936500213450158, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004020020751334439}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.13474976693002946, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003415392488984557}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.21441718014302874, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005537317704553182}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.36454653195485753, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004645366026599087}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.21539137146469606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004028386529865914}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.22298936285767557, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005807349708841443}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.37135465454110034, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004699420865300012}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.222229551042884, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004199234047990457}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cba0ec447efe3977c84faa8ac8ee46d929304278 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.037033597706448665, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006480174033579744}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.2658754336065589, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028536856339901125}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.06288652661017977, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009866252295731882}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.0029692249314187723, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00030354572703394786}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.023302986599903073, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001796060317593677}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.005046491430199868, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004836957271082255}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.03673799721474021, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006063411247670403}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.2649595962209743, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002781644829670126}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.06244410003667717, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009275108134304108}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.026375944242512106, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005028801609392026}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.19861431821513748, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024233727810549985}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.04500463562063431, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007706525242560116}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 0.014682233268574497, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.003688525019018053}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cbf158f9a6d846982ca227c1d03d52975eb0ce1c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.4990908768667419, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00672293123926927}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.3805259635624399, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004958240532995044}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.3790094382887789, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0046211045713230115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.26611927734105884, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005475714121309219}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.19307213953268704, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004131999445973537}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.194190380103921, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003903903754432061}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.4143553380956953, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005965933002105354}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.31867009667119994, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004444126461539901}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.31348326041784963, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004037814530879785}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.4395322981032592, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.006214073376970294}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.3327332188496468, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004497692202972866}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.331045098999967, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0041576690263424}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 7.474586456502016, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.39828351246520366}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a6c0d957b0274ac2a20a3aa416a71c0699ef7866 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6659469751218693, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005282697210611973}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.47545834078235505, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004786244888221911}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5116416549448322, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004114092104651227}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3975962388061356, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005108496306917878}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.27663543585966394, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0042716661772286965}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2978341790256733, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004004300858372415}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5552137990129092, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005109973224126155}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.39635198024010654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004504106157915422}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.4246199460652537, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003945834399770465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5888915350791896, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005163507906173889}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.41693152931630784, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004490602016806981}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.44860309585930136, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038862990100455236}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 12.866414027645662, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.24833464289152501}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ac63fa5c25e7f10ea8564ee0c4cfa7f5d7480b31 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6699599078220461, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005092472906400196}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.48581171511308424, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004848016885709874}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5204893657590925, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003995053794749861}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.4035890102326372, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005057285855009103}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.2874311239495429, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004360261608760115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.30735803941946105, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004029624004352192}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5579808989293318, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004963235286574611}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4043034901530828, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004566274351211597}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.4316278841611481, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038744793307385252}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5899222181000761, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005023121673406623}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4243581128257757, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004524193813515205}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.45443142624873073, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003778925289442561}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 13.405106075117686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.21265154164644462}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dad50fba4919a07788cd85b4e33a1e05a05ba220 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.679557178036161, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0049896787371130945}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.4926458422706465, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004854101278525955}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5315805694115959, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004012328176756712}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.4129644172378307, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0050022305172101}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.29527142677608653, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004432346853014904}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.31723495338229024, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004058983755190999}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5657042645684208, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004851123687930964}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4105814772991126, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004560892673388257}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.44156064111814675, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003921560686510802}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.5995826786382206, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00491266364396876}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.43150554107231065, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004531011417146284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4657105547522727, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038479056635345133}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 13.745741838550028, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13695001364890935}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c53449aadfe49339abfc04a05804cc9e306442b9 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.6826576556901013, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005035472419306049}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.496039516340874, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004919796581238693}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.5338363845067036, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004077687650001848}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.4176777178908642, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005001554560141764}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.29901241727774297, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004423169847085888}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.3201869787310687, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0040154529017539875}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.5695040159880653, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004915689277082928}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4142614814635289, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004604637322440169}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.44383512761730387, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0039016002306109226}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.6022615757723847, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0049610108640678885}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.43461793110243807, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0045641302637812915}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4674544381420857, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038204072100002187}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 14.058124934994158, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18995835187996135}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2f2ba0345c98702e1141aaa1cdc9599e336f3514 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 0.3684892812116536, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.036712135855659016}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.04055453069407477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011144496310485231}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.2383898367427312, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004435035822455946}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.0639148965678454, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0013744475750369117}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.013765451487632733, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004999336960221954}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.0819997006913652, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002759657093416114}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.02212990258629694, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007608947170272215}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.037636089436926534, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010064070012691636}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.2262791588390035, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004150274619964078}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.05950218629777505, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012027985690621355}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.0343763430204763, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0010307718598180854}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.19898079264280746, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0039718249449850924}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.05377498340164964, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001226466015770773}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..889d8c095e5c2d7511b8a6e68df46e78ca4cdcc2 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 8.146495790721922, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.3594818741787167}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.5473703579890434, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006234195978512742}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.42594641383154397, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004933350696797209}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.4232777622497728, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004489922497303896}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.3031075729839342, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005143255574055858}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.2292919782826598, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00401177364228117}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.2275191754276285, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003794704955104901}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.4607152705533072, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00570887962645297}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.35839428699463327, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004451667967473151}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.3535992424841204, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004013818428817113}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.48504564210358153, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00589786396794517}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.37384075852237625, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0044950626203494985}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.3709034906044357, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004086212522565927}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..da0bbd7f4bb22923b82dc017effd4fa4ea096eca --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 11.965909947656085, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.31385721546757483}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6536501149948257, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005409528765327417}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.45486150118372615, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004815350756377679}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.4915502890453908, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004135175958914253}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.38878972701326525, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005187583200225757}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.26314783989707097, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004159223323165467}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.28440321602825763, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003906016823261451}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5524708911463064, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005232101406770008}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.38340825054880595, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004460889502364635}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.4125596874295932, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0038819772760533638}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5799349457482273, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005328155965064084}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.400491019067507, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0044816598272816015}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.43207188694214654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003876435956885669}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..591726ca983e6f969d1c9543de20e0eacaadecab --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 13.13186528743504, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.25017887243204956}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6617563503433895, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005307576635275494}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.4672733883523212, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004779826795862267}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.5046364417249789, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0041137000959081205}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.3990665978536603, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005170131087123088}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.2747669553241779, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004209867424129332}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.29697113176548073, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0040088345337766715}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5583282235127254, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005147780429733306}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.3931717940655256, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004438132016764882}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.4233054491917572, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003914713751938626}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5859572766342701, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0052220876338499615}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4098618562170075, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004420858442059482}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.4427213766987302, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038669409347444014}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0ec536c0512bc08fe2cf6c56359c21d8755a04d1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 13.328668063663947, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.20856051056572658}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.6710716761451476, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00511429409385906}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.47175315947396945, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004727398989004289}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.5147108147070596, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0040421844786449005}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.40579962182414614, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005013526043842869}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.27880930640418905, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0042069671751406475}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.30417235473262294, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003942481114293333}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.5660704860358193, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0049534592472307865}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.39827745149126054, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004420352389146525}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.43294752280435206, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003872021745790226}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.5962684608451612, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005023585380290913}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4164845623261301, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004403741091135774}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.4541200525443879, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038206569305483883}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ffe0f05786e641d0bb1077a443303b17d331e55e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.04944814629677289, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001497247190770107}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.3078611471396491, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005303327109994828}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.07979619965642178, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00202734988031741}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.018309292582639056, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009363142372349108}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.10973537747729963, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003623531828256602}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.028978222976628863, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012880735235947889}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.04549819729320685, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001251826643585312}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.29237462013260235, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0049507093051764355}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.07400909397863684, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001739473777782939}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.04274858858845132, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013570541361121234}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.26685706989523095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00464097795566496}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.06879897817639219, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018242627181544728}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 0.6689759196732665, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05286049862282843}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..83c47070d4839cd80b76ec92b61ad55579fb5f95 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.4827311075483846, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006077411990486251}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.440560161659474, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0052302780699066}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.4035238082909345, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004570572975206814}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.24944030688244834, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004802331976841843}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.22747538576384696, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004244017550736157}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.20506994083869234, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00374059622154692}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.3996179052833305, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005390496217652051}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.36687989332089216, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004656909354140392}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.3324034993111807, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003956103883993652}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.42409893318395286, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0056060227641647874}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.38527217880573095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004721852532261511}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.3519063860696538, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004091141865598614}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 6.588467749744722, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.4032245951577365}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..afdf3c9fdce1d0e03709ba908e35d715241c07ea --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6218232316667821, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005485945035845688}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.48929591497649727, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004945740426108298}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.49917842830527476, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004251608550641214}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.363761721700491, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005070900253455173}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2813926579070104, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004320741447701495}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.28603637886233174, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0039899044556314595}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5146727177802017, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0051593661561140395}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4058527142766151, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00455624921981018}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.41145761111444557, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003932155746306229}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5445021126408139, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0052472546111600946}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.4277096020477725, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004603985369731013}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.43449798192526906, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003939338221863128}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 12.691912397052652, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.40247041652067606}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1b0981169268d2131a330b3c787f95d832b397d1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.6539516131156088, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005120945883448623}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.49134395840583855, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0049736746384781375}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.5196984502923798, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004176674618016702}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.3881069659684622, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0050054138504511915}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.28865850153467393, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004454679661879614}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.3039382130147192, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004145367604949371}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.542464954202006, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004908401431806659}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4081820258818348, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004608939681390705}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.4294401372716624, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003929315367564241}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5761616210022392, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004963802338610327}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.4299823565839692, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004579751956132399}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.4542192589085261, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00386272709242378}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 13.687688261339932, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19387514723356358}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b819f3ecb43490e95fef5ba67fff09385d8e8475 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.666787476202095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004816802985476908}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.4824167251076838, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00488554294646065}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.5205513700223448, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0039244527177236}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.3987364626703962, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. 
\n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004862803672938963}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.2846654569322822, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004369301517990411}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.30482876318174756, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003890400311031704}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.5557839457874688, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004740742530325836}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.4019048819024202, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004526221932495017}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.4319783380790841, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037571034362471243}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.5876109824655803, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004760377125248539}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.42154188186799263, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00450828933959814}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.4547951881516365, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036898013140689865}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 12.977348547129905, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.22464862432323532}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..393e0eca6f73749d6a8ef6dd2c0882624c7d7ea0 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.04063152255085763, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009054683111284807}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.19521133552459877, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0038813865060671627}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.06052568382193392, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). 
\n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0011363889236589205}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.005054974672803902, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003777424396489787}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.0358093728893402, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023699383967658925}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.008292494676019994, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005734768199869111}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.03526164999991809, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007791455445977729}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.1711738377729294, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032942261964877483}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.05237101199172248, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009524720060349769}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.03706225741025466, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008484227497407326}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.1794475729484543, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003570964070190367}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.05504659493150974, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0010385645573037228}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 0.17302442690857225, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02088479151120539}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a919656565cc925b101b5a2e53cc0d63401d0ad7 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.5266983058761379, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005473280184276909}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.4710372208511584, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004978818208675463}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.4486051628643562, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004239934043509985}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.27552483520049653, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004546192682283346}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.24719355667721984, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004206623227614193}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.23227922621904507, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003721636880083816}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.4297432923758641, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004966273860866354}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.3857495217657205, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044865340856255174}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.3642156831080444, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0037789972737372617}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.4578436021001714, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005085772605555204}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4085222771166399, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004575537252880692}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.38745765194476134, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0038618804528035827}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 7.4227316699041985, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.3050632292572588}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4ce2782c6165909d03e072ff1f09e3e8aa3cfbf3 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.5860706534652017, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006142796102630763}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.5028936278021936, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004964031388238394}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.47904191685874215, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004610233219452129}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.3366055690160356, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005205078374672028}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.2817439584805951, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00430035974522067}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.26883463217035797, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004056807034581447}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.4835386188892664, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005530759641627937}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.41990625782772617, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004627778482080496}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.39389153715288033, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004103096564742158}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5126095254492385, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005737254659292593}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.43736947708111573, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004561221700465661}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.41563558935111344, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004187263162417417}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 8.626424905603868, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.30084585108448175}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..283465249a75617576e20660a1ed8f99110aea45 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.6363403403216966, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005423277136161756}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.4955972233334783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0050068211886118656}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.5086336073047357, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004319061668617563}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.3645817354081736, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0049385804033239104}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.2825475726166293, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004406165781571923}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.28809184008147987, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004080653299670392}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.5214189737717493, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005060972606517546}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4081340048943837, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004601320296051857}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.4159281452387241, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004008282134454578}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5555281548058759, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005198395944020702}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4305573948340927, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004617202953527601}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.4414249404982676, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004034492167499606}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 12.663443364165664, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.6003687307907446}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9afd89594460f2e50b09b569b7d2a38c110c07bd --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.657705603414944, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005103281874403951}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.4846039210835352, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005102941916192306}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.5124136340794241, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004227034125527551}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.38405400325709016, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004889050644854438}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.28231903680054155, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004474461877919754}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.29573775508453526, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0040428348803391015}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.5436519042949046, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004878771074784812}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.4010305116437114, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004674690962415214}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.4219068270959287, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003940905009709809}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.5752358432393784, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004969306936657398}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.4212144900895478, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004698142381367077}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.4447700952492226, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003945243545356445}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 13.283879544265606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18046364359305653}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e343f10dd711ad27abc0c85e2bf90b8021bfff98 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.13503566455470678, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024040162474698724}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.23441729288318755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036599750426349466}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.15754810601459762, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002445300070875694}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03140531450622776, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008248347072909491}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.05772646805767443, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015906532638753756}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.03727817770827085, "fixed_answer_choice_list": 
null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009271630142541763}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.09384208659305457, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001689696736451482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.16871420996645434, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002758914891553921}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.11016974085290407, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016745228271743341}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.12513519617347155, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022485478817274706}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.2177607422267832, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034240718038598007}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.145932266658728, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022684805534109835}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.4999917470932496, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09136692821965282}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..09767cc3bb5676d27c747e64d4fe53f02ffe6e97 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.1505381571626013, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024564237561880524}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.2351558337379536, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00345563635731422}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.16455606393487338, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002294994166116859}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03428549150065371, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010296385076122587}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.055229584003465094, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015302468074412797}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.03719814046102062, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge2_fmeasure_stderr": 0.0009354291436813923}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.10761813485443772, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017831187131585585}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.17102216865216144, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025824199249945834}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.11732986444478297, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015793448331767731}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.13982406382793042, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022970904948854606}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.21865517328256018, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032228483029092827}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.15264929345779932, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021225644143411724}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.2010538746394595, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08101371643710994}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
"task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..06003f56f9e0a08381deb481e7b1ae8113f50eb5 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.1876293598203017, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029674404218121616}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.25230478447120946, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033884919761666852}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.18747231585436525, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002366408430604848}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.04975916902771021, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001487770429115759}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06360606183941207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015975217889021207}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.04722227777064152, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010961277893244215}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.13784498031331896, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023167225723095476}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.18552837609394215, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025681044505090965}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.13609982397073794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001695861644814157}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.17452434804986092, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0027761425038835114}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.23522852871681593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003178841751082744}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.17439134256209984, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00220237936065734}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.840300351651135, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07130210999169914}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_3.json 
b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ee1f46bf375eeec5b2ebba375cfb7dd77a1c73a1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.1839935150260243, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003346950875411787}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.21800826428765932, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0035395802723768343}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.16819036448780483, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00249732983792147}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.052348258405649814, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001782889854536389}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.058048612213817206, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016463744725270463}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.04446177615823931, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011470710110295435}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.13974270455081728, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0026867179254436427}, {"task_name": "GEM/wiki_lingua_en", 
"prompt_name": "article_summary_en", "rougeL_recall": 0.1639987000897844, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002733365684474388}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1254498650320625, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018563840983896952}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.17192952870829883, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031753297530428574}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.20270410550643933, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0033069811684398828}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.1564584387566433, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002329587318436108}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.778235976798028, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1174187394147134}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7aaf6268a617465489a9f5f09a542c6c7fecd218 --- /dev/null +++ 
b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.07175606709271613, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002883021045304814}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.07370835441795094, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028219227214772644}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.058775945272166535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002128501293673969}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.021225726932429398, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001353702211218657}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.021001218125129863, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012128317967941386}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.016525925525545243, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008902078561462972}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.05572046585928227, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023257534664432025}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.056105537285426776, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002185488741676815}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.044519899911493195, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001620071457669129}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.06641449373056402, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0027052374207703375}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.0680432273603457, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026209040416315887}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.054134016173616836, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019690557767872534}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.19229073122355, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.021891114094233034}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..16d1bc42702a5e91ed10c9271fd43c4829dbc219 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.014860040829321053, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016169810385583425}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.011501893332992012, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012000235529442773}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.010197681105541754, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010205284597569913}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.004286038547963845, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006750544266144003}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.0034999394129427755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005036071046640206}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.0031538730485515167, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00043997651618954913}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.012372385966053627, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014249396336128456}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.009162693605863482, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009679537038074917}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": 
"article_summary_en", "rougeL_fmeasure": 0.00811132739080454, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008257902852490833}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.014160515487803211, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015675837920299837}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.010763669777769231, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011239509571883462}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.009545008500985965, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009573811061672583}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.5334089241458653e-12, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 7.473669615841683e-11}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a8813e128b7fa2ed9f51c8f1ce9a125dafc4a40e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.08127870970387267, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00159496450472922}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.11614055304212574, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00211819393542503}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.08751176126000156, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015289149605506355}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.011010781180132406, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00046413206931745474}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.017353929713234406, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007980921461916622}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.012313525594602524, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00051069361487657}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.0670588180579985, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012403365589683047}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.09825523452607922, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0017514381154861217}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.07274632268029439, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011790060287316787}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.07601033342044172, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001489877196693457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.10896403734860502, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00199374637678252}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.08188486629306464, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014230045630912664}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.7090027829444067, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.059403705322662224}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4e1eda01659ad7c14765d263849e97fa0b71de36 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.11718371549480533, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019207525066087428}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.1234378142215776, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0021119106085494836}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.10439627151445018, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001492503209077825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.011169246181700449, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006980956389803306}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.012611453444703798, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000789477613635532}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.009754190532310308, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005251092959860216}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.09349645401843876, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015339441744450736}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.09820200472415783, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016478612439534423}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.08249721608530192, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011129719074541209}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.11232285776375517, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018301585055001148}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.11825003875705886, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0019913607518053916}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.10002969343936566, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014109391973704906}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.7784287218692226, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04647782988951344}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..47883eaf631381034c067a2fb85bcd07c7783307 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.1758303182294891, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003462451959062334}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.1883786913476069, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032322227595843827}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.15023928960073749, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023557832440037887}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.046013593879532945, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018013389641468377}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.04570020004067074, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014354529428897169}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.03619017321536887, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010416765949087755}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.13643697056796278, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002803241654465616}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.14527010548054942, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002530982991259681}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.11461382411831511, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017522041790913572}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.1648841707672492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003290762730353922}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.17601314551035446, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030249867013223885}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.14032663089975259, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021996161120837656}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.5720898763538185, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06900520135924704}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..805386e4350ec24f8b3450fba1032f59da5fee52 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.20336271876650794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004029433819785622}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.17729454008110915, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032685763318790126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.15150667381006758, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024684929994384665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.06253967117819685, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024298335038909547}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.04711446698399212, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015238057694360616}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.04099567404353483, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012134639956742775}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.16300354995627306, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003457637855102496}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.1376210474869735, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002552749011506665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.11762153288067587, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019023444904376398}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.19133776900682994, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003854597063658495}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.16579113722679306, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030775732756182375}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.14165398317895528, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002319209416009038}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.40664978256626, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10165209107523865}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d1d695fc83a86584e40c1fbb7dc547eb446be1ce --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.07570691992215821, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031769105627284205}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.059481738771106525, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024694047666392245}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.052169934920531964, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002011618006335426}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.02312729904071614, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016638788683403377}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.01723565302833218, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001102234180757535}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.014696313352161674, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008734069534130452}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.061644364211395554, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002707542539889897}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.046759300119261486, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019578754806582866}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.04097311985693082, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015825649237898086}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.07138944469911805, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0030407812924645727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.055567400646258695, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023184097053874945}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.04866449318467239, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018776959939917014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.08798252672366903, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01615147537721119}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..682fd8942fd4d12394f75fef13d2a9e0c0c1be44 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.013286397211624563, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001504677557742547}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.008705964916807795, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0009664416727583056}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.008355758385893338, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008892610122505367}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.004518509944468516, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007701323271558087}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.0022539281604353444, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003261469102752993}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.0023400550715016594, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003377640465114735}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.011288412771902762, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013224615413419574}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.007132645715713445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0007894051530351685}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.006807029265944781, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007179828918381072}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.01271335891292784, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00145484420282144}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.008225269451860548, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009152493951939503}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.007873172278594525, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00083319726831352}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 7.837525856560887e-16, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.2469168934061554e-13}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6844eb49d8d497fe100af1e7e65888359c48a911 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.08356461827231487, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024170033290248937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.09215261028156187, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002030742204450942}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.07457954055741463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015706133068032748}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.008089449665266878, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00044542591051177844}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.01060270864330856, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006135532813043637}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.008251799871683644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", 
"dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004400505526509918}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.07378864225361224, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0022398054335912194}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.08084815666892478, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0017354953004792963}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.06490017831903919, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012990934205015005}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.0773053527266884, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00233032304469025}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.08512472218977873, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0018927606261400106}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.06843881980738721, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00144201624548823}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.4965418693062675, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 
0.06336636372085828}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..03da3c6d6401a2b19eb29a04039095013411d7ad --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.1281252372198017, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020477888133567127}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.12497569074861344, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0019520455760729037}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.10906609141044966, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014081519487517005}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.01185449066902512, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000840457069932496}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.010852632040411836, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006918502465566141}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.009018221770209143, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", 
"prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004951723995251364}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.10275230010792302, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016619122004975422}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.09981624192731062, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0015160011216860429}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.08664583891836886, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001050482144420516}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.12279548165800759, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00193310449452201}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.12016569662446316, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001852297919694553}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.10469247701628683, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013253125566948223}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.7256071294825857, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07948879643932899}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c6d01dee64639e6d9079b45808890e045e1f84cd --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.2130994513227411, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0036353936247548735}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.19599404536132142, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029965609835451374}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.1676452338322572, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002227551090748977}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.055485760051538865, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00215799809893319}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.04479590241268949, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014249893850712678}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.038230611510681065, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in 
English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001102150265059144}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.16773450143295235, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030808144747857297}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.14980977816188165, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002281955305092778}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.12791823608415107, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016384015342240182}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.20253341704063588, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0035146252518724183}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.18480217629421566, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028137217195253915}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.15830475139273484, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020952349680806023}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.6332112820833435, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07276165331548821}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fe73471a9d2acc8d955c0aaab1276a819e0b7c6f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.21405966081177125, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004161193500566949}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.17673627728619523, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032206945936224464}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.15504698516533022, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024750219681964492}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.06600877424854136, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002513661939389598}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.046653184328022644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001511363884859671}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.04072525071554273, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in 
English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001154064126239323}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.16992128001744655, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0035157095180261316}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.135841050753072, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024938095859423927}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.11896885200232418, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018574676959350028}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.20278412255856923, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0040134091234927125}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.16584081593068659, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003022250624756856}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.14567952073605564, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023259948668787433}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.405742021443971, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08765857428125907}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4fbe87c6945fae5a0acc7e39a06bdabc2c3fd8d2 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.0764117688968914, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003310015328209796}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.05500790234723034, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023533448786149888}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.04985418381603688, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001960922158649172}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.024322633331148896, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00172947059164052}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.015488676770406946, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010366733748569117}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.013743928699783595, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in 
English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008105948544237309}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.06259275682135215, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028290362323728545}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.04314953057223309, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0018562044556596725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.03920084186369486, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015347913250690194}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.07243692089182784, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031760715288622755}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.05161651824502423, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002210435572023429}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.04674099530422977, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018346514063613708}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.04603112857491259, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.008088785186144275}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c74e30fbf5c230353fd53b1d222c11874aa34418 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.01158797019081859, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014113661523309372}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.007589223568536127, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0009189129383816963}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.007387558901390138, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000843094067827481}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.00396343343286023, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007692116628974225}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0020173657517965145, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000354114887572235}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.002066764939323049, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above 
in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003674443571385706}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.009881225809305108, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00127413856729575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.0061423589274229315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0007464014877194372}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.006005050518755766, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006965777440156409}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.01100925224171755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001366746162025473}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.0071530057213977545, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008733261333433707}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.0069366139221113485, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007971177406173816}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 5.838488988667251e-17, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.1928335157223854e-15}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ef8ee1644e6d14978da094725ff23fc7d32f496b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.13065355742673593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003004776280579319}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.1786059794590456, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003480497255310532}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.13547708241929612, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026556069493799896}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.03291780593276675, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010760829551416996}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04663674589032859, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001504992918818976}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0352234336154376, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010798725770486046}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.09987150235228559, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", 
"subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023685337113851174}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.13959473564876165, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027581190918920103}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.10336654675385076, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001968971023431868}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.12288788441024788, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0028720387887482485}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.16759622234914887, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032867178590762366}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.12705867398208964, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002506787857940285}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.0868626526463756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10334620651927755}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..27593039b0a4e0bb00177a616584d05f0af99456 --- /dev/null +++ 
b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.19126686092152348, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003082884423906421}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2013507500089625, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029925213818961095}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.16580650031794167, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021193004105035614}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.044582320866862424, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018321118047088484}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04442769282883079, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014782140733801531}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.035746404962808155, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011192977008782524}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.14988887413463797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002586002929489674}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.15563614848002308, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023345594461348924}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", 
"rougeL_fmeasure": 0.1274347165087169, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015926173758816556}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.18119105294163743, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002938341173461725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.19011415446724061, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002811944286183604}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15656227686437413, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019872168695675768}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.5514775583592173, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11284261690010776}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a271395c9211be3a3709b0790873fbeef9116d56 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.27518353034811777, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003681886785126964}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.26994010795497553, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003076759192772023}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.227527675745211, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002262727638760964}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08493527530542791, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002342907743897752}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07562522541046568, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016670921194054875}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06444156598905733, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001316135695695516}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.21378447499783487, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003086057725507419}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.20706303996123673, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002420883132061566}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.17356357206141904, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017138094026774846}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2594677476381253, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.0035263951440822477}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.25382240907884707, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029022538349274}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.21385148741041204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021335293397074695}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.160843322149366, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1298621943311205}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..48239d4f740b764a08f52bb58a1eadd89eb82268 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.25197379414278026, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004061114481390069}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2241545120711431, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003349453782690701}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.19542809253461213, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002567437085860653}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07748731951936201, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024143985835070676}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06396850914384923, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001702594494628315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.055649339745118376, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013436864732712206}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.19629979985917206, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00335622576475892}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.17220070773118773, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026273647249788665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1493802268370947, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001958022255827534}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2374958020095092, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0038790177970112016}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2104711045184697, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031578834852280147}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18360399651239948, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", 
"prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002422426009983074}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.550269268318738, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.123976667805652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..708fda4d13c8f238ebd894cd3bbe9ea35dfa0898 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.08736393590844754, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003320236333651585}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07170734676784853, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026863536115591326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06369530530813045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002225322220412946}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.026539844024013316, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016227872678637359}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.021485803029836836, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012234458979275497}, 
{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.018728966913288573, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009803445596888972}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07026197905523278, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002775489585285312}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.056214726744091746, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021292176604441565}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0499467876025126, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017576059173842731}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.08238350525240551, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00315393172421982}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06716878885751017, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002524004286911468}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05976166581566339, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020932837351865146}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.18727975729565796, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.024743677554065382}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..601ebbfb7c13961eb7820076f9de240c48eb5c8c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.016308055800076632, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016664962689093485}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.011189655817252247, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011802896184760587}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.01043565444271075, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010239464330389257}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.004996639391364696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007412751630573523}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0035850495924414864, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005653334006649237}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.003212219718708495, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00044504346236305713}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.013336917042039756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", 
"dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013953475029781011}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.009180912567410115, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001001081279334972}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.008463888867128002, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008498965362098245}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.015695607203614483, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016199770379973}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.010692369328543307, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011385128958340267}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.009948416660924693, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000978895492286139}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 5.577157241826622e-13, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.1569346422902409e-11}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..4d77100a5ac55821f6103cb671e3aed80cbc81d6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.11966632209944479, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018289026636228675}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.17699917791063266, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026232545831137728}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.1301753308413737, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001845181946043979}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.01769714030024297, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006122997523884595}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.028670426399557025, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010678780392830523}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.01998190512680113, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006699945809559366}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09438727023667497, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001303032090526388}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.14103328454918124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001963566607993366}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.10242209288458175, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012780368890878686}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.1118510836616567, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017112676227580969}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.16569303942590743, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002457833604587797}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.121571478712092, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001719907239916338}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.03372792986763, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04498324238148764}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..80b2c25339ce30bc5a8b20fd6c61e6b8afdcb2a9 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.11863961702545356, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016088988795161811}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.12008771066629581, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0017439629461919743}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.10502771804225536, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012846977475557287}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.007275016003204525, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004169535248542494}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.008322730126316197, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005516341243080924}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.006731410059333951, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00037866401637734617}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09461567478817941, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012475184392361028}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.09556473399275549, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0013339685617020215}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.08310986365787817, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009355454004341337}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.11442130354097658, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015357165164883602}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.11581351195921423, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0016588658424958223}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.10128744060086141, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012205159612731504}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.5402539021847423, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0624145046719431}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5c0cb83bad1a23a826eb420d942393ca0607e68e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.14826479664329056, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022603198325511016}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.19594580754248062, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002751527088887584}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.14860219897828134, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001917302371146199}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.02493138347514756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009874915174681875}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.033473322025973654, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012512631437445213}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.024563417541210154, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008202915878607498}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.11089102163547263, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016898972686976235}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.14784069855662121, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002072707847991526}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.11041398908642593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013335422583366145}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.13993502338514574, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021373400435202843}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.18435209978526113, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002565912635880525}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.1398773618649369, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017851823888757378}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.623453279222236, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06273494818292862}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..05f3e101e90662011a223070fef73336579e0530 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.12219509555835543, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026166419024163078}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.15850063378476156, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030215463841090718}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.11745387491243063, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021004572141077632}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.023677234953105034, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011613452323130811}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.030381539120417637, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012727961384364653}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.02127276979142645, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007674253666113632}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.09430746394042341, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020959207990629792}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.12231473215281553, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002337973909999958}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.08912109764317326, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015176601962052629}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.11499260596518415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024911178674758127}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.14866443294458095, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028317609417797232}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.11001570950646093, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019597563244517636}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.4377303685533975, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11251563995967095}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..38b9fcc2b629c60a1e887b6cd8d8ce8fbb8a2797 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.03310867480312166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019161965531372164}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.038587507598865006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0020099564019102865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.028565135483987054, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014237764447699868}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.008639108153884063, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009900284053428036}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.008463237849863133, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007375666493237083}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0061590533339769245, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004920915450177505}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.02658430393615535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016197062516045348}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.030269053997186237, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0015723260820266977}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.02217628719189859, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00108264414651739}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.031188351551050795, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018354648916057872}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.03588875149470869, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0018627048019628082}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.02663365521272406, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013268975899176129}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.031709892809790814, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.00589379637253143}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d64ac494be73944de3cc9b530316b3a6dad99137 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.003641954878791069, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.000801030571157579}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.003014962652659718, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005505619854307441}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.0023856746059350152, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0004343793032224447}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0010633847104680439, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00037205433457009767}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.0005105251875900575, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00013313252308758052}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0004938815252314359, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00013206141666044304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.0031336768330716977, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007374760132829963}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.002362031253209354, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00041493245818900585}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.001865391120809492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00032336237332238825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.003559126075504872, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007918434685765267}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.002883544387289442, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.000520757291267085}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.0022948013067646622, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00041592013921203823}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.7895417015749206e-29, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.71058847601332e-26}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..670895cbca889fd5995540b18739e6981516abe5 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.33, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01487687202745673}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.321, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934647}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..424fdb4c52a2f184b02e41cb1e2ebe35b4bdf62f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795025}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.334, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01492201952373296}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8041573d8ca19599051e8c9d08050b3b2b9c9f3b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.338, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014965960710224475}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014933117490932575}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9870cd89ab532f7e0762b8b19142635dd314b356 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.351, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015100563798316405}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.358, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015167928865407557}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..58cd407686e912f72192b6f384353018aae5742f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.332, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811485}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..50dc31239ae95d50fb335ae75c6d91d5f88f1618 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.336, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795025}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.331, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01488827258820394}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a79b1a6bf125a0af8a5903c75f97bdc1d05bcbb4 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.331, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01488827258820393}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.327, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014842213153411245}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c368dcd53eeb2efff8d37e1b1d3939a9a4b2dc29 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..642ff253ef709a030274d1b286aa5cdede88fbd8 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.351, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015100563798316405}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.353, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015120172605483696}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5705582443e8afe4ca5e37fb0db7006a8aeaf237 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.336, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795021}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.341, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014998131348402706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f2a3f69abfecda9a863c30de828bbd19f87ad4e6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.337, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0149550879186536}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.349, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015080663991563098}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4ca87ac12fae408c917fbed7ff14144f3f02af17 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.357, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015158521721486773}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.35, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015090650341444231}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f62b07343d2052da590a71b5e4d0bcefd6ac68e5 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229857}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.334, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014922019523732968}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..395e692e6002bd28514dbbf280ac8c24d44101d4 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bddc6392005bf34f5c5ff3ec18cfe277c40b4863 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.336, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795027}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.323, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014794927843348635}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1a09518ea6219726135063d91cfdef232ab96077 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.342, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015008706182121728}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928373}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2fd3ae5525c05673f98a5644cd2312147d95a633 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928367}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.321, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934647}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..457700250b55d7d2eeafcdddd108ce9159eb9ed5 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.307, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01459328489285263}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014806864733738859}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..68744d2b0d57434bdea1b8bfcb170c3a13e16692 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.333, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229857}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.328, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014853842487270334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5f339d5dcc96b51a4b8c67fe10dd097e9ff505c5 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.329, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928366}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014876872027456734}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ee85ddb2d0a948ccb107cbe0578c84395cf23c09 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932573}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014933117490932573}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ea7e02c37265eb461b6995d8c01f217928e0e1de --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.326, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01483050720454104}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.321, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934645}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..46e252a45a4a168bacc599dbc517656cbdaa7873 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.313, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014671272822977885}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.306, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01458000605543697}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..350efce1890711a51475d5d81d411586744af8d6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.329, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928362}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..15146878904a03332aa9bd5a3dadc8d248559117 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.342, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015008706182121731}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811475}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e031594b0f4e3a87b05057fbbce6c54ba19d453 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229863}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e33b1845d6587b5b491c318aa02fd1b98dba2682 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.341, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014998131348402707}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.329, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928364}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..055592383b7c21387f77bc528f1185dc57e61a1e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.348, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015070604603768408}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014933117490932575}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3e35bc0ef241d01efa802479569b304e8eb8cc0e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.338, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014965960710224475}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01489959724281149}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..026e8be39911f9c72dd3ada75aa5e2984bae8d74 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r1_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.314, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01468399195108797}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..53ef2659bbec26e182fb28323d24cf994545f124 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.325, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095527}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.354, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015129868238451773}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e5f69b27413eef935488c7faf057227eaed8f48d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014910846164229873}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.331, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01488827258820393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d99796b22cb52ef531c68068a8da134bf70de608 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.318, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014734079309311901}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.316, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014709193056057142}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..20ab1d9ce540d9a312768eb4121114361147e3f6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.317, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014721675438880215}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.321, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934645}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2197220bd1391b1e2618184ac38ec134337497b6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.315, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792508}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.316, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014709193056057127}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c33f00e9ccc541d730fb6c98a6183e9256d4d715 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc": 0.316, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01470919305605713}, {"task_name": "anli_r2", "prompt_name": "GPT-3 style", "acc_norm": 0.313, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014671272822977888}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..20571753833be31ecc3bcbf1d0c68c40b9e4e67f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.336, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795021}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.332, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01489959724281149}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..93a69aee799f6af3c67c952e69d6304eef6cac20 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.315, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..477ee58cebdb602a0abe19039707d8659f9822b8 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.311, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014645596385722692}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.305, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014566646394664385}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..99453190246aa9685d1b8b209cf196a1d4a4773e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.292, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014385511563477341}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014498627873361428}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..59d2327ca677330cae8ff3338a720db033292dcf --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.303, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014539683710535269}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014498627873361427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9f4b09f9211efa31e9f7cf7cf9068f772f0d9165 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc": 0.296, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014442734941575018}, {"task_name": "anli_r2", "prompt_name": "MNLI crowdsource", "acc_norm": 0.3, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014498627873361427}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0dea330b5b22a3eb6828f13bf855f0367259868a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014955087918653616}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d69dab3852c211cb393495d20f381a9e7b782ae3 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..92dcacb5ca772ecbd6b8f3835c7154922d20877d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.316, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014709193056057142}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.328, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014853842487270333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a60f36b1cd9b1f7dcdae2d332a5c64a920aabb75 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01488827258820393}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.328, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014853842487270336}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e75ba8a1059c6f4be1c37e05b8fa9d911dec697d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014888272588203928}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.328, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01485384248727033}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2404c788f9374d8b269d55fda27698357633c85d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "can we infer", "acc": 0.318, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014734079309311901}, {"task_name": "anli_r2", "prompt_name": "can we infer", "acc_norm": 0.326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014830507204541042}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..92e8923317b806b0c6f7a01af2324d5c06fb8985 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.332, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811483}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.337, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0149550879186536}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1583003db21f3266e1114bee8f84a5deadb97bab --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.314, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014683991951087974}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.312, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014658474370509012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..094b973dbc9dbb3962cac0057ea6034c12997624 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.324, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014806864733738864}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.325, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7df3c4eca272329a35b677aada392077d06c31a5 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.323, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014794927843348635}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.317, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01472167543888022}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3ff1e7dc31c121deb54744fc8d6ba6b2404b076e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.324, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014806864733738859}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.318, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014734079309311901}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..eb2dfeb4283f27826986b8060566a8939e5e890c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc": 0.335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932577}, {"task_name": "anli_r2", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.339, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014976758771620345}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8652c5315050e9b9b6688efa6f308833f6212f37 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.345, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015039986742055233}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422987}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f7683c515097b23135839747909ac71cb02b40ad --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014696631960792506}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.315, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014696631960792506}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9447138402b85262de897bfd2f7b31a268ad6796 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.323, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014794927843348628}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.331, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014888272588203922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ef98bc83d1e09fde4542f8272bf2689662401ece --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.337, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014955087918653595}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014818724459095524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ac258ec06511e4e20fac7e378ebe1948512f030f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.323, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01479492784334863}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.328, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01485384248727033}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7ca63198c2030dfbd876ab6d85a8a65be2a4e8ee --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r2_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r2", "prompt_name": "justified in saying", "acc": 0.316, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014709193056057121}, {"task_name": "anli_r2", "prompt_name": "justified in saying", "acc_norm": 0.326, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 2, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014830507204541042}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8074fefafab595b80283c85ebc257c7e617e0849 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3283333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013562032919529019}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.34, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013680495725767785}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7901d51892563e0a0b8e6cc008e973ed9fb58685 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.345, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013728421539454876}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3416666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013696658778002519}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..56532a3089f1a9bcc6eb62ecff83cdb870b85d0f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.31666666666666665, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01343407866082739}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3233333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01350837286730022}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6952a2404ea982d005664c7e43e50243ae87849e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.32916666666666666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01357080625843363}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ca8cbf4ff6eda9adacdd4c654d03214069c35d49 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.3225, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013499258621103249}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3333333333333333, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013613950010225601}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..01f7aef2f33e95aca2b0cc8e083beafd6d045813 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013630871843821479}, {"task_name": "anli_r3", "prompt_name": "GPT-3 style", "acc_norm": 0.3441666666666667, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013720551062295756}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..785d168a9d433433991993af4c804e7510e053e6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406389}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.325, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013526454480351028}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..608d79b25ab2005fbc847ea82263d8ff231fb4f5 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..78820f22e85771fc23d0f147174ce4dc06c1a948 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3258333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013535422043417459}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881638}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6b1fee6e7421ebdc83b3d5cd55ea993d38895ce7 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3258333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013535422043417464}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013471620929769144}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..705479f401ff8c1c67b5f1d6b8d5f2f4f3c90358 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.3233333333333333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013508372867300219}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32916666666666666, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01357080625843363}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3702ddbbd05342bec3c67860166ea37bb57e801c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc": 0.31333333333333335, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013395739415639082}, {"task_name": "anli_r3", "prompt_name": "MNLI crowdsource", "acc_norm": 0.32, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01347162092976915}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3afce0beb68c39e37a5bdc04b7e91122cafcff69 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3416666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013696658778002519}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.3308333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013588208070709002}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a538c6bbb028ac7ea0a37408c7182d732242923a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..68878273fe93ea0c663fb083ff1c8da5e9b4ffaf --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013526454480351025}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.30833333333333335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01333672114313647}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..20b800012921f53a229071e7973ebfa59a596008 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.3433333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01371263383046586}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.33166666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013596836729485166}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5ca8d917d30407ecd93c8081adec3bc94df64506 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01363087184382147}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.33166666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01359683672948517}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ee2e8d762a2c34fe97652ae014587da1ece15d2b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "can we infer", "acc": 0.32916666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013570806258433618}, {"task_name": "anli_r3", "prompt_name": "can we infer", "acc_norm": 0.335, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013630871843821476}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..290efe82b8f28e68ace11d3afdc2f131a73283d4 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32666666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013544340907003667}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.31083333333333335, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013366457845965445}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f9a5726c325c7fbedfdddf95a12a09670b86c1e7 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01357953127780092}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.33416666666666667, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013622434813136778}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..41a979a21af48d04a6c755be0b04b302f6d2bc29 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.3225, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013499258621103245}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.31916666666666665, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013462309712005134}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9be47a27674afc21ef15b2016a7cf802ff2b91fc --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.33, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013579531277800925}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32083333333333336, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013480882752851557}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d2ff51a357a673122a04dc6d75a60ecfb1e816a6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.315, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013415009084004866}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.3125, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013386029277441229}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..229a348d503d8ce5cec87a64ccf9bf3cce742618 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013471620929769145}, {"task_name": "anli_r3", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.32666666666666666, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013544340907003663}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..61b17ba766bf80269de7a8ba412bc2e20dc3e6c0 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3416666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013696658778002519}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013605417345710528}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..415262e0fca6e1a1f703e591360440af98b0f6ba --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013647602942406393}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33666666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013647602942406393}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..595a56cdd5fe5421ed9606695d58cc0aa85e2154 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.31666666666666665, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013434078660827384}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.30666666666666664, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0133166423190707}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e7e42d619f1f1bf8f056a5a5f5d87e8013a3f498 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3375, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013655897185463657}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.32416666666666666, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013517438120881629}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..119c1aaf198ff7e4eaaf2f2c8d0f574dcc464ed7 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3333333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.013613950010225603}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.3283333333333333, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013562032919529019}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4254d228522a4071a728f94e26cf77002a891ea2 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_anli_r3_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "anli_r3", "prompt_name": "justified in saying", "acc": 0.3325, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01360541734571053}, {"task_name": "anli_r3", "prompt_name": "justified in saying", "acc_norm": 0.33166666666666667, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 3, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013596836729485163}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8058dc6354e70b1fa6c04e91a2e518d5d53b2de9 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.22696245733788395, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012240491536132873}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.22696245733788395, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012240491536132873}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6786981d058d3015aa84328f6e8ae0c3679ce05c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.22098976109215018, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, 
"comment": "", "acc_stderr": 0.012124929206818258}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.22098976109215018, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012124929206818258}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..987ae9815b51f5cf823b3b5134c7bb5cd98e1c78 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.23378839590443687, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012368225378507135}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.23378839590443687, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012368225378507135}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..46d0f3fcc9b2d09df67064391f6d8d26a5c5d775 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0122889267608908}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0122889267608908}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0283fcaec783cc7cae95f1c7c5c0b49a9b3c166b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2508532423208191, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01266819862131543}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2508532423208191, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01266819862131543}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 
3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5be81c6e9abc15da8880914c3303e5d80d32bf2d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24573378839590443, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0125810334537301}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24573378839590443, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "540ebc31-2ea6-4feb-a6fd-67b6e71cf20a", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0125810334537301}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1e2f07d3fe75cead11b9248c91e04e6c5fc9310f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26023890784982934, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012821930225112556}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2901023890784983, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- 
\")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013261573677520773}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c4ac4ec0f71700ebc259bd8db3807aa8f7d2bc09 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2568259385665529, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0127669237941168}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2960750853242321, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013340916085246263}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9d8a8ee641e330e92f7a845c1382b04b276073e3 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2696245733788396, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- 
\")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012968040686869154}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.29180887372013653, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013284525292403503}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b14483d44f4e484bbe56a55eaf750b4ea76b1ab3 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.27047781569965873, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012980954547659556}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.28754266211604096, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013226719056266129}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..a10b90c2d901f9164958d3ff11b416ed0d983a5b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.26621160409556316, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012915774781523223}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2909556313993174, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013273077865907576}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4b6672a9a880e91477afcb6241f3e5733453c998 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.2508532423208191, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012668198621315433}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2841296928327645, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ff84886-9d5f-40d1-80d7-2a39b7c16ec6", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013179442447653887}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fd93de14eb23bab77baa466389b940feb1cfe7af --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.24744027303754265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01261035266329267}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26535836177474403, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012902554762313967}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bd1b7a5525142105b1c1f09959097bda296423b9 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.23037542662116042, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 
0.01230492841874761}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.25, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012653835621466646}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d8cd7c01ba4626d8d4d0ea70c38ac5dcffa0080d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.22781569965870307, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012256708602326917}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.24061433447098976, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012491468532390578}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..28459bc59fd78d4c5643d5efa85ce7f9d015e189 --- /dev/null +++ 
b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.23122866894197952, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012320858834772273}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.23122866894197952, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01232085883477228}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7483f1b4adf6012069547cd918c4a5886492d8b4 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.22781569965870307, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012256708602326914}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.22440273037542663, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012191404938603836}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3a71a104c11118d4635f4a5d65f6926e61cf3401 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2235494880546075, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012174896631202609}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.23037542662116042, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "ced2b33b-b590-4522-b041-51d7dd669561", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01230492841874761}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6713035a140e09bf3d40b370c9d2274998f4f30e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23037542662116042, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 
0.012304928418747611}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23037542662116042, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012304928418747611}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..59cb568eba5d0bf59e894890fb201db526f72c75 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.22866894197952217, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012272853582540794}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.22866894197952217, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012272853582540794}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..32df0cb0112927dafce368f00e3920dc5dceb9a7 --- /dev/null +++ 
b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23037542662116042, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012304928418747611}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23037542662116042, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012304928418747611}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2634bb7bf64cf5eec7b9c20755b9cc72db21058a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.22781569965870307, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012256708602326903}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.22781569965870307, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012256708602326903}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..77f94d68ab5c304d8a4079aadcdb0d2aac0de33e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23720136518771331, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012430399829260861}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23720136518771331, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.012430399829260861}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8b2cb5210edb5fa9851c77e4cf966ede005c13a4 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24232081911262798, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01252159329580012}, {"task_name": "arc_easy", "prompt_name": 
"pick_the_most_correct_option", "acc_norm": 0.24232081911262798, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "e371fc1a-8edb-477b-b345-9d73e97ffade", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01252159329580012}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cff9ca760a4df59216faa975af786c1118b25eb7 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2636518771331058, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012875929151297065}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.28242320819112626, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013155456884097218}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7747288daa08ace88200a42504c40d0a8a5eefff --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.25597269624573377, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- 
\")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01275301324124451}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.29266211604095566, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013295916103619413}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..caef94168d937f6d8ff44ec5329cf811fc6908eb --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26621160409556316, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.012915774781523231}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2858361774744027, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013203196088537367}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..433cbc5d7eb6fdc059b51abdeb22c0e07a180f9d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.26023890784982934, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": 
"5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01282193022511254}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.28924914675767915, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013250012579393443}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7317653da70cc8c2b48dc2f44e7ecf05fb093609 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2627986348122867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01286252317535133}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.295221843003413, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013329750293382316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5028ff1fb41d35b4ebf04e57e1475de93ea4a443 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_challenge_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.2619453924914676, "fixed_answer_choice_list": null, 
"dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01284905482685812}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2841296928327645, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "subset": null, "prompt_id": "5ec2b8ca-e4c0-444e-b097-89ccce811550", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013179442447653887}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dfd5a0b73401cfa5b081c1d16192e86c5a558a74 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25252525252525254, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008914948991495706}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25252525252525254, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008914948991495706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..9eee54747d0e62ff312d698efce1700ef090fe3e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008844984581934903}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008844984581934903}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a1382d4bbff7679f1e627b7ce7e207fa0cb273c2 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.2588383838383838, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008987501845758058}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.2588383838383838, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008987501845758058}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9f100e63b13bc23601ac50b1cb810cbe46fd30ac --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008844984581934903}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24663299663299662, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008844984581934903}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..57380f1539cd54a1278a4404d934776cb6b9f8a2 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24368686868686867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00880917174472056}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24368686868686867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": 
"ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00880917174472056}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8a83fc4c12c66488b0bfcbb158d7955000d4b734 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.25336700336700335, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008924765424529264}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.25336700336700335, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008924765424529264}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1584fed66bb7a531985c9b2d1809b83894d4199d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.375, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to 
answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009933992677987828}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3164983164983165, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009543851857323891}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1685f3c15adb627897d9b03bfa9fd20e6e9e5dcb --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.33585858585858586, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009691180932083506}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.30008417508417506, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009404000558513351}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_2.json 
b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0417931c1a8fb260539b9e5b15fab4ad9ea5ef71 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3261784511784512, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009619849417035182}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2887205387205387, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009298805565435511}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d243c27253dbd61e6b566538dc5ea49bd5d75588 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.32786195286195285, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00963258707617002}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2828282828282828, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009241472775328228}], "config": {"model": 
"hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b6e65ed5a6490a2a89cba193e1d3cd75759a7535 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3164983164983165, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009543851857323888}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.2904040404040404, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00931483330293628}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0b18bfcf1a1a695e47a2db033dd66a3ab80a1b23 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.3148148148148148, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009530150430975598}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 
0.2937710437710438, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009346423298166722}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b0143b7ca11289092dbb88a88c014c3544688f95 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2840909090909091, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009253921261885763}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26557239057239057, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009062210626971845}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1a4e766d94476fa314055cd178f755c34a104b18 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 
0.2638888888888889, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009043789220055136}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2756734006734007, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009169229476542565}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7f8978615c78a0fbd897a6347e6069329c6324b7 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2668350168350168, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00907591585926727}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.2807239057239057, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00922052617471136}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
"task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c4aabc4eeefbb5ecdc7978b081f05f88725598a1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.27230639730639733, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009134218447652678}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.27104377104377103, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009120919741760597}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..757cf54ca72498f47b017ca64b96a03fd8515e26 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.26262626262626265, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009029861776763752}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.265993265993266, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple 
choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009066789565615694}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7b937c4841b6f58f447681875b71936355ecde19 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.2537878787878788, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008929657065808293}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.26346801346801346, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009039157374497715}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5f0c311380b22c4f13739efa8b25702a0c1130be --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2537878787878788, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": 
"033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008929657065808295}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.2537878787878788, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008929657065808295}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..13caaf4d41fa8cb4e6093e4912b95618d69ad9e6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24368686868686867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008809171744720559}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24368686868686867, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008809171744720559}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_2.json 
b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ac02f3b19286d1be445cc3257517634179f26564 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.25126262626262624, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008900141191221648}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.25126262626262624, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008900141191221648}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d3133dc3fb63d1d663c5d65041e21576e60dd8dc --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23947811447811448, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008757032594354034}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23947811447811448, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008757032594354034}], "config": {"model": "hf-causal", 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4badba044aba6ee973d8d7f72257a10aab4b8261 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.24284511784511784, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008798836444222033}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.24284511784511784, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008798836444222033}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..561ef8695bc88d3bcff98ab546a7b775d1ca69dc --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.2474747474747475, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00885511441483471}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", 
"acc_norm": 0.2474747474747475, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00885511441483471}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..43138457b05834b5471f76756e4aa98a614dd765 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3472222222222222, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009769101679700909}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.30934343434343436, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009484615220606831}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..09d4b8911f45a1d3db98da589f9465b1cbd2914d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.32702020202020204, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", 
"prompt_original_task": true, "comment": "", "acc_stderr": 0.009626235849372194}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2916666666666667, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009326752065621162}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..31db3e43cf453990cc6daae537df10209662bf16 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.3287037037037037, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00963890316702216}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.28324915824915825, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009245632200075455}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fd2410da9732c40a3287e7e70798c484d3a87f9f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.31986531986531985, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- 
\")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00957082182057359}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2845117845117845, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009258050925618821}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..235a063c6b74517e66bf67b4a89d82eec8a04da1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.30892255892255893, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009481048387761353}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2904040404040404, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009314833302936282}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6febc95d3b9cbc319d76388ce38734a01bcde598 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_arc_easy_qa_options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "qa_options", "acc": 0.30723905723905726, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": 
"{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009466688832475374}, {"task_name": "arc_easy", "prompt_name": "qa_options", "acc_norm": 0.2904040404040404, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "252aa566-9482-4e81-aad9-664a9bebd8e8", "prompt_jinja": "{{question}}\n\nOptions:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00931483330293628}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..29ce6fba5a6afca92e207107c0f5bb8ae2a60d00 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5883333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008986619341172336}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6263333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008833986042519329}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5f29a5068bbbe088627f254c9ad03b16cd0ec3f6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5896666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": 
"492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008982215188519145}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6033333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008933122315228992}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..053a54f76408e6e8732675d737ab8d03717e3535 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.587, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008990955404907169}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6156666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008882569490543049}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e642e4b22ecbc8857358bd59f32efe0c2da5b5a8 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5986666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": 
null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008950698369218394}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.611, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008902401412932078}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fb95a5b7a615c800724ba8362fd93ef972c583bd --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5913333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008976614094836194}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6086666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008911995272576809}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ce8658fa080e345947a51aa6c103332b440a6de3 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_GPT-3-Style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5966666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": 
"boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008957972256087361}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.602, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008938230472973836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6594c5ea46424622b000d99bfb56e1c76c433dc8 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.605, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008926639623340282}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.37633333333333335, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008846558976258924}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d6fdc9a2d38f657ff24a3cf406604a8eaa5c4c96 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5406666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099982269204863}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5406666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099982269204863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7065c809b9ef19ca86026a9a1a76de67008e1464 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5353333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009107405418833935}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5333333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00910991912725527}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..05aab1d746c353552df8250a435482bf3161d6d7 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.5136666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009126819837938642}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.5043333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009129888226428837}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2f028c3a270991167c06b87e67f3a1d86bceb627 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.49266666666666664, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00912924906387328}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.48633333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009126819837938642}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..28a721644d79d00f8534686e0a0ac629e574fd4a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_after_reading_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.4836666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009125358337932443}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.47333333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00911723665908298}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e5f00a44640d8d68e02dedc0f7e33d931fa19b8b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.6236666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008846558976258922}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.617, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00887674483503322}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ab68a35f89afdd6ba33c71ca6e3c33a328c5092e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5426666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009096928229880423}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.542, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009097962646004976}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2894edd6fd89a7d731f3c156b2444c05f6389cb1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5266666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009117236659082983}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.521, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009122174705469926}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0548cefcfb6f863ed36bc1d3849d91f7c467a046 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.515, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00912612159491215}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5076666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009129157751283578}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..960fb2422789a128b54acc289ec230ece612286f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.506, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009129573723461864}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.495, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009129774600800658}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..93c7fe01f8796137d26bda17b44769363d2ffc96 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_exercise_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.497, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009130066778130833}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.4846666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009125936876338593}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d9c9b1fd1a9e46269b0c00a3c2bb335e1ff47d8b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.44733333333333336, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009079439381402937}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.37566666666666665, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008843442555522142}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5837020761c3135b75be01de628e0f8862e5c1e1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5413333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009098980657278165}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5406666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099982269204863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7117bfeed808ab869d6ac296c3a6a61fb1b2a30a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5446666666666666, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009093726495969151}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.536, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009106534814375938}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8353bdcad747a18ea577d51de538293747968eb9 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5283333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009115560243539177}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5206666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009122428543456457}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7ae8e4440061a31d4eda4c09f98a2b5a33545aff --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.512, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? 
True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009127601238448371}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.496, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009129938951699211}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d3a842b6da9cced2cc810621c1f571bfcb80b599 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_valid_binary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5033333333333333, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009130028227490719}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.4826666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009124743220028738}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dac87d5c7fef28000bf331a40ce71cbc2c7ccfec --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.6203333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008861873799148993}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6236666666666667, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008846558976258922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7e7b30e0218fb934923d8bc00ff39464a921e70f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.541, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009099483512819305}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5406666666666666, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009099982269204863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6af9d3a145ac85c9743677203adf79835125f829 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.592, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008974343780026194}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.5943333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008966262991425923}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f4b4ea1d3d98b12eea759eca3a16016811bbc6f1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.6033333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008933122315228994}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6043333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008929245712536294}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..433d0a35ebf177b0a67100990ef7d6feffa0ed5d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.603, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008934405848700118}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.616, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008881119942353995}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3ecca9876f18ea078ec461ab0e30d09cfd5f33a0 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_boolq_yes_no_question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.6103333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008905164372580982}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.6143333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? 
|||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008888323636208593}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..af97997ccea0d3737e2982b986e27d6286c0bc17 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.3392857142857143, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06384226561930825}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.18571428571428572, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..881b0f363c217c233b663030a7ece636fa35f36a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.2976100628930818, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3f39d211a262fba36c035bf9e1ee0d9b1a120b00 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.375, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06527912098338669}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.28595317725752506, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..861003573e0c098c808f916604d3a4c5bb69e9f1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.30057471264367813, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2f9361ab0be144c475d992ed47625c298af5d9b7 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.3228070175438597, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5aee18d19f9612c97e8a79e9b292b265de27a076 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "GPT-3 style", "acc": 0.4107142857142857, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06633634150359541}, {"task_name": "cb", "prompt_name": "GPT-3 style", "f1": 0.29455848810687524, "fixed_answer_choice_list": ["True", "False", "Neither"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "75db2bc2-3caa-4956-9653-13c7dd6255df", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c0f2f29df42450fb7bee5d74ca22b31beaa6712d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813057}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2850877192982456, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..41c502ed9420b774abc5ad68dadf9590b5d6549b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..911bc60af86d26e53f3919627eb9351d44c7cc1a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0672477765493766}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.32222222222222224, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..75be0b5ee2cbc9284fcf6ac61a2fd0a78ea84732 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.42857142857142855, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06672848092813058}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.23986486486486489, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4defab24b98fcf8f276a294ce1fa41a0a5d8170e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.5, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.2660493827160494, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1371599bcaeb4602b8c13bb1d2a6ee88340d5a11 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "MNLI crowdsource", "acc": 0.5178571428571429, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06737697508644647}, {"task_name": "cb", "prompt_name": "MNLI crowdsource", "f1": 0.25267737617135205, "fixed_answer_choice_list": ["Correct", "Incorrect", "Inconclusive"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "bee62bfa-5307-4e1c-97b2-2ad2f7bcb179", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eabc39f5a37e2225793ea1ab552a4e63e4695c12 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.30278191753601597, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..adfd89607f4173ad4a6548b8d2388e7bc20531ad --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..49601318106890f8bc5286f8b74906c2cb159aaf --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06724777654937658}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3421052631578947, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9f30132c721d3a361b3b37a6eb254b22e69a7f9a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.48214285714285715, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0673769750864465}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3519445514054678, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3fdebead4fead8fa9113523a421068343d732a68 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.5, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? 
Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06741998624632421}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3485871467866967, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cd42ddab28d9f7bb494e353e9054d46996457c67 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_can-we-infer_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "can we infer", "acc": 0.48214285714285715, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0673769750864465}, {"task_name": "cb", "prompt_name": "can we infer", "f1": 0.3333333333333333, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "2e76cd0f-68ca-4f03-83ed-11cf15b25a84", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %} ", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5dd17d81201b512555ed3c5166bdb132631085e9 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.14285714285714285, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0471841613625583}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.11942959001782531, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..580230bcb2320d6fb483d94d38bf180aadfeb0b3 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32142857142857145, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0629736228905634}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.25043478260869567, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..585a7f64c7253cef20954352cb910965f1468342 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.30357142857142855, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06199938655510754}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.2763645998940117, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9a2812234fc2707a421ede45b094b207de8daf87 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.2857142857142857, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06091449038731725}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.23859649122807017, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6715ac9a8474bb7ed620773beaed532833468c25 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32142857142857145, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0629736228905634}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.2299145299145299, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..06e6ef35e4e0ce7e901e8a73618922d992021902 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "acc": 0.32142857142857145, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0629736228905634}, {"task_name": "cb", "prompt_name": "guaranteed/possible/impossible", "f1": 0.24952959907367203, "fixed_answer_choice_list": ["Guaranteed", "Impossible", "Possible"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "90ab1002-093c-4e54-b48f-626655e36b65", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..222e53d03374b5d5a2ff286818b69147ffff80d8 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0672477765493766}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.28359788359788357, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8b15791b5d01ac0c362fcc8d7ecdb0a433816379 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.39285714285714285, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0658538889806635}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.2842025699168556, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..167b8c304d702854384a3f34e1c2f645dc36399e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.06703189227942398}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.325725338491296, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cac669f77436f6e11100bb0a42900dac180fd61d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.44642857142857145, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.067031892279424}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3237591332829428, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d7b9c81251532b92591c870910714500924423dd --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.48214285714285715, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0673769750864465}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3404040404040403, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8eba5d893eccf66b56ca4b9b70c92dad6c14523c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_cb_justified-in-saying_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "cb", "prompt_name": "justified in saying", "acc": 0.4642857142857143, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0672477765493766}, {"task_name": "cb", "prompt_name": "justified in saying", "f1": 0.3253272334477062, "fixed_answer_choice_list": ["Yes", "No", "Maybe"], "dataset_path": "super_glue", "dataset_name": "cb", "subset": null, "prompt_id": "5c9b1fa9-93f0-4f82-b9e3-e0967e4d7260", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? 
||| {% if label !=-1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": ""}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..83b5db0a647a505d26f19a8615d2d8e89b3c7073 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.65, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.047937248544110196}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b853f500b9f71f688a1efea8ec601d3b120cb4aa --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.58, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.51, 
"fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..30d29d32f77e59967c2009c72423adeabaff0605 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.57, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9438531ad22626cc513c5350621f5a3c81ae13f0 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the 
best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956913}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dba16c536ae02e36f6861979e0eb267a88e7ff19 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.54, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.05009082659620332}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cbbc6ac7a9f7ee82ec666484c0eaa838482dd0c3 --- /dev/null +++ 
b/4b284b42boscar/eval/agg.4b284b42boscar_copa_best_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "best_option", "acc": 0.56, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "best_option", "acc_norm": 0.55, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "8ce80f8a-239e-4393-892c-f63dbb0d9929", "prompt_jinja": "{{ premise }} \n\nWhat's the best option?\n- {{choice1}}\n- {{choice2}}\n\nWe are looking for {% if question == \"cause\" %} a cause {% else %} an effect {% endif %}\n||| {% if label != -1 %}{{answer_choices[label]}}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049999999999999996}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aebdf0de70bc9a49f655a7a7e3d0ffe05509ca0a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.62, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.048783173121456316}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, 
"seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a829cca29fba0bed17463285c2caaef9c69d1899 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049236596391733084}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..29c060fd986eb4e70b02c53d3551b084f851dc53 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1cbe52578410400694fe0ffc1eaa39e003c25c30 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.37, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04852365870939098}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.37, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04852365870939098}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e86c2fa4d6e67242a83d66c7b3f98ac982984376 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" 
%} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2666c6d2e57da84b8f792e29499ffc9f1d6c7a2c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_cause_effect_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "cause_effect", "acc": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049756985195624284}, {"task_name": "copa", "prompt_name": "cause_effect", "acc_norm": 0.43, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "a61d8c21-da25-47bf-b5fe-14a8edd650af", "prompt_jinja": "{{ premise }}\n\nSelect the most plausible {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049756985195624284}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4b3fa339f5774171743042ea3e1a49e80d65f05b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.59, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04943110704237102}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1292e32589db2f5c855e0f3ea0672e2b48d7bf0f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049999999999999996}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.049604496374885836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..943af1d85fed8eed82daec3774b34daeefe9d9a2 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04902071300001974}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.37, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04852365870939098}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0073086e9b2b15908289677cdb286b0c80d91509 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.37, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.048523658709390974}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.36, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04824181513244218}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e16923a341f93450d072bab2991e2d1b01c78052 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049431107042371025}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... 
{% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ae65d5dc4820027782c5e529fa3bb104eb8ca697 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_choose_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "choose", "acc": 0.38, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.048783173121456316}, {"task_name": "copa", "prompt_name": "choose", "acc_norm": 0.37, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "f32348cd-d3cb-4619-87b9-e24f99c78567", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} because... {% else %} so... {% endif %}\nChoose between:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.048523658709390974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1c8cb0ff56028a1955d9274b7631f7e713d72833 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.55, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04999999999999999}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.5, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.050251890762960605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ea00488f7141b38212f21c848453a702ede0c81c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.44, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04988876515698589}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.45, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a1bd6c9db3fc535d75208e9a4975d7ce0277a980 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3425820ad711e4c11e2d048580146dd3bb47d2ca --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049431107042371025}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2eebf2738a460a2f62acaf3fdfd762b76c10aa0d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.41, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049431107042371025}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. 
Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fddf6a1ab55989d8901455df12bc12adc87d616a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "i_am_hesitating", "acc": 0.38, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.048783173121456316}, {"task_name": "copa", "prompt_name": "i_am_hesitating", "acc_norm": 0.38, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "4d879cbe-2fd7-424a-9d78-3f5200313fba", "prompt_jinja": "{{ premise }} \n\nI am hesitating between two options. Help me choose the more likely {% if question == \"cause\" %} cause: {% else %} effect: {% endif %}\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.048783173121456316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..67a4fcc9aada2762b817f057f81e0343a5bce1b0 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.6, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04923659639173309}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.51, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.05024183937956912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2ffdd8562f6c41169d20aceb10d832b31ef08b4e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.47, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.050161355804659205}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04960449637488584}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3b1ccb412e542122bebe8eeb1186b0c406f49f5e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.04902071300001974}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.37, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04852365870939098}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..389d6847770f9c39fc74a9d9d8933b607bb7d7b6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.38, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.048783173121456316}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.35, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0479372485441102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a62ef0cc3c88b729aa64f8aa499dd9565ff7a1d4 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.42, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049604496374885836}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.38, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.048783173121456316}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..93762f5993b09217e9c21372b10389e428eec11d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_copa_plausible_alternatives_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "copa", "prompt_name": "plausible_alternatives", "acc": 0.4, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... {% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.049236596391733084}, {"task_name": "copa", "prompt_name": "plausible_alternatives", "acc_norm": 0.39, "fixed_answer_choice_list": null, "dataset_path": "super_glue", "dataset_name": "copa", "subset": null, "prompt_id": "66ea075e-4d03-4a78-b1fa-9a5228cf0c9d", "prompt_jinja": "{{ premise }} {% if question == \"cause\" %} This happened because... {% else %} As a consequence... 
{% endif %}\nHelp me pick the more plausible option:\n- {{choice1}}\n- {{choice2}} ||| {% if label != -1 %}{{ answer_choices[label] }}{%endif%}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.04902071300001974}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2d0afd6d27233b616b56a39cc4e5f7b063dba30d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 3.6336092621190903, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04963488024792009}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.43315596188440153, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020658767170531234}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4957166938261749, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001866401578793264}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4535276083867647, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016078121139671377}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.1849292195431834, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011286695386874615}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.2168971083308747, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014173517714469036}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.19529164267883087, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010857096664964741}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.3234464772574325, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012239406876133328}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.3815922019580204, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001958691690626948}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.34319575393248214, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012177223555536177}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.3387500414303267, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018088042362394187}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.3884662275886484, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017571209037241864}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3549101277176399, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001508528726933971}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..feed79599a0b0817a3b628c7e6b8794c20ffec32 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 11.497966951726749, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.20969517538787533}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5632434121689073, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035151496739973145}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4271980530284272, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003193464490019439}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4604743115551077, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002650137715023285}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"coherent_text", "rouge2_precision": 0.2697550180919695, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002821745940147497}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.20055621808744717, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022326904923778744}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.21675097190154122, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002130665758231791}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4118615721075579, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031358370899496947}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.3089968540225795, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025812304907704814}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3340759170486623, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022699367889290594}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.4622969291805104, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003380992090862591}, {"task_name": "e2e_nlg_cleaned", "prompt_name": 
"coherent_text", "rougeLsum_recall": 0.3487092046094382, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002878950898546305}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.3766682772648986, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002537216273666152}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..10070023f7cbbdfdf8d47902eab2b4b5970821d4 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 13.46002251789312, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.17779419634920132}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5942518719864448, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032186316753523097}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4643183570824007, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029744455358002987}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.4956710429402688, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002287948952720807}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.296024603420829, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002794307645016215}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.22746918536800087, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002276601310067811}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.24294338474194113, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002102733770788168}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4375809918999564, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030519076981421716}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.33805697661700573, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002492306655427877}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3620142334272749, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002119015868595214}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.49322571419356365, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, 
"subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032440445435041525}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.38313830841766866, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027640726437621606}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.40982719923359884, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00233324427012507}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..772f6cc29c46128d8dd4a6d31abb5b0886021c9d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 14.228668386742754, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15741694271607098}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.5984788690378352, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003197560184234838}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4725742646775533, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information 
given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029898036325022134}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.5030401775192643, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022871900658989718}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.3022078979155364, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002768059681464576}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.23634888463371143, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002364216942607757}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.25125664699332345, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021594431913264894}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4402691357265774, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030458831253925003}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.34438265485639696, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025298846029330625}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.36768240781122236, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given 
below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021719131443325925}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.49682458630116, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003197442546788019}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.391344468013188, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028109879310869203}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.4169702193831535, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00236830852153733}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c67e0be78a497fc7a48d1efbf4a2dbc4cac5e0a7 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 14.597195611135092, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19311940839672206}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.6033209977447119, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 
0.0032262604730037233}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.4748762815970977, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002945258001815744}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.5067540268822891, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002283857469804599}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.30663914951234, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028252371140640954}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.23823746120517744, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023749274600023823}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.2543335770489735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00220496267618016}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.4445418586714487, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003042523823125125}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.34796937168289555, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 
0.002576869804389209}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.3718070181457613, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022081793332601567}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.5009193423101975, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032015130321828705}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.3942571507375127, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028270235680156602}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.42072079174732374, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002392828914020526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1ac6191e44c1e2d698261fbaef476acb4eea23f7 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 1.4830980923039687, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.021118997157981167}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 
0.16033400703017334, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026324818883430674}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.26969890616718895, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00300612512619589}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.19411260589173338, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026931921934754146}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.04108732076700223, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013148092674986009}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.060179100405235525, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018083443739043902}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.0471164796345766, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014453353118279637}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.13141266843169974, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016840231738956485}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 
0.23233998433211528, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002079592569134541}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.1623047778451774, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001733234289363297}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.12413304341622819, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002193995817408237}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.20741850668045922, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025905538539559496}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.1497420581979396, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002264568864352182}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..61863b2f41848bfed2f4985977846f0ceb4032ee --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 11.052875734645225, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, 
"prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14880854986446493}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.57014937693693, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034678949077726964}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.42698229782553804, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003175445550377141}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.46240711895501474, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026000155597878864}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.2687730297236951, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027746644235063494}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.1978323547353253, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022114672007333223}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.21455453622601164, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020999880904874935}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.41381444937572104, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": 
"711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003104864621244597}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.30576495763203765, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025267141814989897}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.33247489589579843, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002211720556331402}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.46455549393143064, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033363704023788546}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.3457433020096904, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028302467398599804}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.3752913090337666, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024768066693497694}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..b43a29d1b85f2853b0c3035ea3e840cc2d02b08b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 12.991357667842433, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2148940573425749}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.5896158988648285, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003227466696309816}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.45591912216432784, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002945187087829095}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.48878419234716897, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002280346300880379}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.291483506564891, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028136818980263323}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.22065139023543603, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00223563856964428}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.23698600371386896, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the 
following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002081490451850364}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.43257854015901653, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030534619486731036}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3305654008256819, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024648687370697397}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.3555024478259199, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020958132366617455}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.4864701775008372, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032498731210847303}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.3735813004719031, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027186299763853495}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.4013706220535876, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002300602644643578}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 
2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6dd6da97f38525a449c903119e5de8b58acfde17 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 13.762369274263193, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1239176716959734}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.59429608818623, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032250742001364}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.4665961665554913, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002964182169864703}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.4972877840918587, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022666769876385283}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.29729315956239166, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002800668893654478}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2299667816072873, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023293965476642994}, 
{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.2450997873827247, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021372854101083423}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.43693224172608097, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030744702439597735}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3396715433163892, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025008213475653076}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.36309320543932994, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002143455970205985}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.49167275767919666, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003227991912008724}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.38503302211525176, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027832705857527332}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.41071083444612827, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002345494497419325}], 
"config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..74e3d2e35b4b69c28df346d3b0815f506ab474af --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 14.428600064889515, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15433340933184303}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.5943537947777199, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031999293554169036}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.47228487122340007, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002956708038848888}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.5025477379055636, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023113153921818035}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.2998475825029556, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002791055852445239}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2352088394682515, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002377332758985399}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.2504150328382754, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022100011211586643}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.43594043199679045, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029783466717854096}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.3443040962007768, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002545103701911904}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.366995310341045, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021887174581775594}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.4931294642744763, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003159722931878829}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.3914753200381526, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002813609189039384}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.41671475645067774, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023914867754907575}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2d47599e87f9601359510ccd55d4755d5ba7a056 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 0.0005117932625447547, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 9.536197427966122e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.07240788007030449, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004022247456753345}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.018253220521184315, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rouge1_recall_stderr": 0.0009033239887635698}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.022954814700845268, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001068325564862084}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.010383184176156, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008601583355918767}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.005509807793947819, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003484284415338221}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.006565521451443904, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00039997706182641485}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.07151594361458591, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} 
{% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003994235701988043}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.0178020630508808, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008766434342585233}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.02242271358700061, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001039923095268228}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.06627554096777186, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0039461603525234356}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.014308412178016428, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0007253437526843341}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.01827124494666003, "fixed_answer_choice_list": 
null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008646161286213295}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3bb57c13665d1980d6f1c3e114d96643eb43add9 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 4.721645081870147, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11055575784993}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.2724718400705274, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004738978458220745}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.22483884027045697, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = 
feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003941300807919795}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.22193867861495062, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036201064747419113}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.1110669378591428, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002415691980156218}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.0998953071293873, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021186084361931204}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.09765121722587221, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001961865183877688}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.21699291071030072, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003963831108261732}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.17178828430922233, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0029026558236146843}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.17063146146283567, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0026520595762520027}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.23219225461340648, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004261782115439799}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.18734037977237278, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| 
{{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003306476925594047}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.18546274919682199, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0030526937038671213}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5b7b78ea9236cacf0d7f85faec31ec3111c82644 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 7.330361926611788, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1393713297511099}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.397688884843428, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005243232044302793}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.28953345540286196, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": 
null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004270611999067108}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.29951823566096936, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003943331079304518}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.19106018811512912, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003494466457298735}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.1386335472956288, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002514572800352678}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.1420081261904641, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": 
"", "rouge2_fmeasure_stderr": 0.002358304787932758}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.30864554668132316, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004308343244325185}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.21603676807123937, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031178212713556404}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.22492458878541202, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002857976437028613}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.33293896422770386, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004658899774399506}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.2396948856097552, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in 
meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036140816121463643}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.24818144922627508, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003358374241363087}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cf1903b400d8e70b4492b4216dec8bbeb7f4b389 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 8.582824417171965, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15758544085957477}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.44237363313007766, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", 
"rouge1_precision_stderr": 0.005197770565781296}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.3256644422872396, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0043344718084630095}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.33914919408530725, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00398888812190965}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.2164451576020569, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0035403319127956744}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.15653074707623968, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025939385453826597}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.16128257662655165, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set 
key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002432579571236265}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.3325732643672117, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00412359018355362}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.23863001496343497, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003169633829427399}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.24925358955339996, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028737162375128585}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.36169334864492975, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004501482660071617}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.2656809328179925, "fixed_answer_choice_list": null, 
"dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036603625436697033}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.27632455232693603, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003375878671989277}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b713daa3882eb39ceffb36f5347cde829c55db37 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 10.418299373383455, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.143116835758759}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.47588547581116447, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = 
feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005023552603812295}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.3592386557163939, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004267887970496025}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.37529427535129467, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003972066091031049}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.2368115010419087, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0035477130748788645}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.1755184404072541, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0026501024504753078}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.18235846045485526, "fixed_answer_choice_list": null, "dataset_path": 
"e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0025546984319096866}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.35325726671965807, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004022741316670696}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.2608057046678188, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003158960957102914}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.2731858449063431, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0029261481157958646}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.3918934290122221, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| 
{{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004437483351553158}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.29433296219990573, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003646588910109909}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.3077556604700645, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0034362468382048103}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..98327ce9b285638727df24b5ccaf35cf9e4dde6b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 3.1932091730586953, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12204583279523666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.41337918705325344, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0052760059693995}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.2309238228257254, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032774148724919335}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.24046689564492205, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026584396733124543}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.23463707329236097, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0062472430372265}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.07811161999290202, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001551040325003802}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.08042598706024275, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001263671869367697}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.3599428455550758, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005381765248981534}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.179931821205033, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002310618989620828}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.18994590486023305, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016519612628498022}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.39508222499421797, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005341940324256954}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.21269354787389044, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003034851683547011}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2231487737804936, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024418581547524815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..37ba2721bd7e9b2e9fc48d07729f1ed008cafeac --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 12.375900606698423, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19606823444326352}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.597454575764657, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032329061443020345}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4434643604110928, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029889149083277117}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.48293554827595836, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023435879912405283}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2910060712314691, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027476178598673066}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.21176179817940102, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021589328308703566}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.23118450232385498, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020415023824907675}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.43888436474875336, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030457086517011057}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3220136107735307, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024545810076108667}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3518994031108773, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021201781748616372}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.4908609045585646, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003223896911814539}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3626278022836946, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027433883081054507}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.39555654566655024, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023470162024293896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c225e8bf1524fc8722e90c0c04643e25cba86a67 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 14.369325725727359, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16247647633686452}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.6089651864273457, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003198435500725766}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4684512148933178, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029309075224427496}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.5050896808334919, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022820908410676275}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.31006692351264853, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002823237008286986}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.23489561514574075, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022593518873055855}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.25360971138111243, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021297710933538033}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4528063450828851, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030647195241319818}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.34593233406974633, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025245059893910897}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3737642937216708, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002190131174828545}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.5104756682075146, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032393352108801977}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.39199556131381524, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027868585721285742}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.4229068329404585, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023792336336982954}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..768cf59b5cae58304a5a283691f6153d5e8fce50 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 15.229236548887188, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1893298480282073}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.6019638812498068, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003154041355931588}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4769354854853059, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002958348996182155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.5094328850058195, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023113991646470547}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.30884118560646173, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002746739827702377}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.24294392646302612, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002367727486010716}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2589859331129998, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021744147637838404}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4461308937388599, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029602114104730864}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3521903496319005, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002557452362321216}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3764193148894467, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002186561194132359}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.5062370391801149, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003164925596063054}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.4014335787816741, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002844420919383205}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.428648276943535, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00241888010522652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f4bae70c04559312fdd43b385fea34bf24e41dec --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 15.5976388656912, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.21552554445196853}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.5995295914989719, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003160879006436628}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.48298984835020653, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002984598478781118}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.5121277417214488, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023435839886818115}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.30839025650068297, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002750823115926885}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.24686659194192842, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024081433509198598}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2613413893640116, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022244910107010367}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.4436724332909479, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029870463284766544}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.35568244613898303, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026122906695223214}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.37763811148528253, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002256869338791484}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.5052162393739371, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031736753796748323}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.4072510388482175, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029018213850476}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.43177980537098043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024732350477868438}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2570466cd424387a345cb5361499d4b53c2a04ef --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 3.5945698535151527, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05015613229197462}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.1518335745611247, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018826786001323694}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.3204231881776139, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003749517143648088}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.2014677219582868, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00236715942636543}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.06474667748021277, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000965958278788391}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.14330865656477818, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", 
"prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021929123057498187}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.087018168845551, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012616678095828638}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.13008693019278264, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015822109164815863}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.27854183494329476, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003339816956023396}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.17348222244786518, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020292638348797455}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.13205681807979777, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016948884637677832}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.27959042918923643, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034135086071623226}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.17531663258612987, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021346250443011615}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6dd5c51dcdceab40daf66a6fefe3a0903de0b8b8 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 11.965870070374333, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16483121107829488}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.5887118747238361, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034037823001644547}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.42863531404813354, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002945871261238008}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.46931614928272436, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023464967015723428}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.2917732877577307, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029515576898139703}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.20650048693459347, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002171478739145782}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 
0.22727215524670805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020921315305935666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.43843638462977585, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003267387355090894}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.31435132059874576, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002449056457480189}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.34594037793676896, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021730860064039063}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.4863382057285638, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003427467174859405}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.3514705730796979, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002706234060494214}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.3858219923088528, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023631470234993725}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of 
file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f84cf00b4ab45258487cac46e1acb82664f5e650 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 13.21968762329751, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19753671493984845}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.6001939642148559, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032889319145619654}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.45215722065445574, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029376225460110163}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4896336928218898, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002272681148009135}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.3042584905990978, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029124035723080053}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.22448908642607843, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022714538946666265}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.24356704396149256, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021235805300181703}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", 
"rougeL_precision": 0.44603166342337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031685568327235944}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3328512396680296, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025185492257657357}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.36147096558421893, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021784713573052518}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.49791643563111876, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003299490210633222}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.37373958585789946, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002758682539439094}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.405169085603143, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002336476952595647}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..65530fda1a4bfe024bbf99ab2a8bad3773c510a3 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": 
"text", "bleu": 13.762757946910973, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15395787830101001}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.6011962101235646, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003243407906957494}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.4566959902853601, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002939867346407233}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.49443903206008083, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002288175378374436}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.30545692484615405, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002828597032420292}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.22960389174017176, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023569729423993362}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.24825098805394827, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002175994450385002}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.4482976384673195, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0030983880866779887}, {"task_name": "e2e_nlg_cleaned", 
"prompt_name": "text", "rougeL_recall": 0.3382220596725317, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025420156167704428}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.3668711184434305, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021942133007791824}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.49895735281420034, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003239442063627894}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.3789157003201587, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027965890193868393}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.4101796683834667, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002376314535157212}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7dbe595a83314070b74bf4c1699ecb3bfc098d07 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_e2e_nlg_cleaned_text_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 14.08932220330059, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12549599830246483}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", 
"rouge1_precision": 0.6032747175760897, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032330932608095417}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.46029173412458885, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002961686972053866}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4977678289874263, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002315197828946516}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.30785136134317653, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002851909271826754}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.23142634215783778, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023233772081180178}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.250383581684577, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021703487973632127}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.44843896379060744, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00309459957521781}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.3397342484556464, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00254503160734854}, {"task_name": "e2e_nlg_cleaned", 
"prompt_name": "text", "rougeL_fmeasure": 0.3680698318578711, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002203713649900909}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.5007987352925727, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032114979248749087}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.38188336871065476, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002800647830926674}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.41293672055666814, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023792263326761756}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1e4ac709ded5c12d4b5d985094818ecbfba4617e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.1071307957238157, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016517097059310102}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.26802963730298424, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": 
"", "rouge1_recall_stderr": 0.0038166991596965635}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.15126799942015656, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022274714202831503}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.016455460229799202, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007123063350781688}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.04241402980618847, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001839872857682253}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.02341459296206923, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010037294271937876}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.0851420400256054, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011652272980718196}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.214378710831144, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027666678497373413}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.12042858816848816, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015747379656493184}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.08583240847633615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple 
idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013027504044229373}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.21687477112312012, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00316229908245589}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.12150553920325693, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017784859933383196}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.7808151339872366, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09382381387908688}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8ce789a60fd52969344433e9eeec65f3901ff119 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.11231051130110176, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018725155871750657}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.1795623731554383, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033273148415623524}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.12864627025251077, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019671741638819457}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.007601036529870395, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005382965250024377}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.01463614027423228, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011353174654812333}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.009363085787250576, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000657318457701902}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.08529490041926313, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014199778984168093}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.13479472021332767, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023922057175442494}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.09703185675955185, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014143082322830222}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.08981383040850886, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014661369138302548}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.14526601892808536, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027443402153164392}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.10322737003200926, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015575341100461024}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.44746058261891863, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04604614675806431}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..314539093a0390b43bb7f98194a08678008af917 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.16788705210345736, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035631862700196597}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.1859715150205194, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003718461517457391}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.1630833669947709, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029013541077108885}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", 
"rouge2_precision": 0.027280423968171305, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017884680404872415}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.02861726878304468, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016227282457088239}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.02516880091161777, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014518773666627352}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.1298654453376493, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027792255251731595}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.14241015540538696, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002737234265271257}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.125424737042503, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021634801655348837}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.1317729164353648, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0027919825527681222}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.14675010657719567, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030004020311325185}, 
{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.12801618488485492, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002241267571048609}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 1.2338766623936686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0863575218315581}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c68abd4d4e18f1cdaef14a71d038f44ced823587 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.20259935374214025, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004297260212942113}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.19902344092534727, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0040898284395801425}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.18680534135299007, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0035041138182561496}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.04047153603808275, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": 
true, "comment": "", "rouge2_precision_stderr": 0.0022111947670748746}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.03870933901712769, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001945297453974642}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.03634684076285987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018252272028769054}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.15363058299522142, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033796539988497055}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.14959751674894153, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0030580637340826234}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.14103166922609478, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027058623767703674}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.15548750806102077, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033775890032156424}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.15343744602601786, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003228331954516354}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.14344562299193356, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis 
boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027398053676387112}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 2.112211471327635, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14836531376817008}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..21004efb74a65ba36536bde352d628036adb9483 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.06203252815051904, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004046315146705861}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.05167378825092661, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033958009117904748}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.051670093376308644, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00325478898460509}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.01609711772190667, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018101320874714744}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.012532422565499546, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012529251267989366}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.012790849399041708, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012580759880557381}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.04843983623971503, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033335322774442353}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.03883146936809033, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025763740768868872}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.039365007752481786, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025465835991996433}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.048932832240163246, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033502554817490303}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.039682203568842785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002636894081082925}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.039952964885033464, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002574589625833568}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.08776375172639635, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0254874930206879}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..742999bfba352074c467c11cb08f9527a9e93bb2 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.0017152658662092624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001212355466087544}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 7.94438927507448e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 5.667175161318954e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.00015173505739543477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00010815684401450718}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.0017152658662092624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001212355466087544}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 7.94438927507448e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 5.667175161318954e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.00015173505739543477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00010815684401450718}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.0017152658662092624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001212355466087544}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 7.94438927507448e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 5.667175161318954e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.00015173505739543477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00010815684401450718}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..23cc73aabb459da070f1f1e57fae903f2ab9fc1e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.149753853048109, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002171727132004842}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.33575386280850533, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004569585612811194}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.20327156203276772, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002713519692487557}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0373304252845041, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013140461748437202}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.08667444780169199, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0029020162558933056}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.05115437745821428, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016948540468750808}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.11684544594863705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 
0.0017379797448369813}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.2626066143615487, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035996707108239072}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.15860444913174343, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021216399101338006}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.11959497456732406, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019073480015924227}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.26906161142003515, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004008001098551392}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.16243069564873555, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023749553691038666}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 2.0038553875789904, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0955856896174526}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..27e7cb1e1ae02ec0f19960bb9a7ac6df82ebda14 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.17568310283334154, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003154566926090178}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.2735517968470343, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004496187884537725}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.1978014293145674, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029030615223376317}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.03589501457929329, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015153906885206705}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.05948047287527727, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002405412603446267}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.04146006996703184, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016153060489392866}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.13012656454111257, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002376680785916719}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.2033977431475975, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034053368336631642}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.14648095189504623, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021711694446675286}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.13565628042823968, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002405886992298969}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.2159357251685773, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003778783915859853}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.15404254885643145, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023143063162019595}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 1.8050866450785243, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10998069507650922}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..72513823b7e52045ce08ecc562adbb9220da55de --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.2771488326124715, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004467706670226065}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.25485033292705744, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003967723186833506}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.2502568815425181, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0035630016377887442}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.07229011551366707, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", 
"prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002865914324384087}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.06603510981541624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025917576128546044}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.06491562652621455, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0024653642015722986}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.21436486172237043, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0037773047502249776}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.19611922774785026, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0032881375175458053}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.1929431469393036, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0030168487926143113}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.215602404940091, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0037769923822126854}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.19818717656448856, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003371545970704401}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.19440288733566372, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0030350725691875064}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 3.37468537370651, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.23333263171377386}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c30cda4d30abc8f605bc45f7b339eb625c58709a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.27792934145157255, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004849829964193726}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.23766382636060715, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003915272947542149}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.244573711894895, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003835454456407674}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.07564028912702611, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0031166559518488166}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.06284434238677651, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024326471023516597}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.06539576616966959, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002549322979655533}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.21520151898487322, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004159378136303307}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.1832103735419546, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003250379217526897}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.18891811330851163, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0032841796559050377}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.2158020371347959, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004149734653819935}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.18401121788190536, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003258676196585185}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.1895940498397185, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003283363186227225}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 3.440764901759586, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1784268606441582}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2d9262e1ecfe638d921921917ee559fe724655a9 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.07246383175431523, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00469786089881535}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.05629931108498489, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003640521050340188}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.058787434685230615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036645533335454812}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.01903215203677445, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018590330603459604}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.015828376332120486, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015323609301644817}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.016318581481069933, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015664988644061195}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.05759037860265505, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00393018249775178}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.043864628765744866, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0028934618336297607}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.045917438807934134, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002931975565773022}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 
0.058060643012514464, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003941612774228265}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.04443074379368858, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002927987421647252}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.046389176010339, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029498199662157673}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 0.07086985461456602, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02998805114415839}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..23b444dae37aa08a2d5a014304daf3e38f43ad01 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_DOC_tldr_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9f1e45dd433776f91d37dc788dff7b1ae7197a0a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.13999986827582567, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022474794503698063}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3146848228074294, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004975343003506539}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.19062853936311658, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029532646040511964}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03163976089485668, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011861306107240704}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.07306747424744006, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002675800285732193}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04328828501096498, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001566248398707883}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10267416140662887, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016806410262001903}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.23218112607700236, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003798988027549183}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.139911595168302, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021985427116768684}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11113494233445374, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018618965914790858}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2517399727563149, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004226741179316262}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.15161941662067227, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024594718320593506}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.7872267963517905, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10045358377651255}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e8fbaed170485455ceeb3e9021c4764a7abefcd9 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_1.json @@ 
-0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.198389498815838, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0036200528310182637}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.27297271430803394, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004198789938626886}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.21134911198456563, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003029508784148872}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.043045220973661015, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018919242792135235}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0597065055598483, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023731544903471393}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04534620252044498, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017858055311589297}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.15080384564385396, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028920115769217117}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2086058984013953, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033970509765821025}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.16064653466424578, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0024169691348135945}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.15275988244015698, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0028982643948039046}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.21320242569675277, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035810539802842534}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1633349245997364, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002468028232419068}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.0535765112268263, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11972597685190434}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ed7bd306dd0e93774be74583419e9ad34a1508dd --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.2607761013228957, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004351702830837392}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.2607885042839857, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": 
"", "rouge1_recall_stderr": 0.004018215060642568}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.2444857696168434, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034588270505411917}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.06344356630530605, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026820157377482148}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06209709779478734, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00246191601941908}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05852877798479666, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023072565365240597}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.19907633090246687, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0036157515137942723}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.19814987180436297, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003287204314003516}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.18609020665428283, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002889852915757798}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.20074551849299913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036053665449958016}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.20155142761693595, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003414189551327218}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.18820566058762211, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002901279776307625}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 3.0737243948901085, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.20707696514444188}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..329dcec201ac7c9d127b712eb1ce9d12c70b768a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.26517071973676093, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004671054222681604}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.24180770628858178, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003940301667285207}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.23929443585701224, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037168757909286886}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.06573984745309881, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027796960958526426}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.05833153123977987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023156387603366626}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05849869572638163, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023240414297435643}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.203598693888488, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0039976969321419975}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.18373116025857217, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003225733424944883}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1821366529554913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0030775956513009466}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.20485233666950228, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003995574052859589}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.1854022097149016, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003252182345953945}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1835081526249607, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003085655347849612}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 3.3049815114104333, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.159904818084904}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c83f74441000ff7e4b35ce966b535bcd2a36bd40 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.07320382597863412, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004382339503654509}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.06154298544226265, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0037288317198754368}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.06274888299344217, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00368347747261438}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.018885269050898535, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018067685701354235}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.016055891153780352, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015325179921096511}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.01636894923247962, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: 
{{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015408484981787023}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.057742538854948004, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003573097680574354}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0479152284127263, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0029805271711328637}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04906837355261336, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002968163725283441}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.05793234813776215, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0035819297580720103}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.04803187451993844, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029874284500312834}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04921620084471268, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029754368968071495}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.14790698979130756, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04384286085154373}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..342cce32817e6fbcb1288ee24acb72e00f2d88d5 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002313901471635568, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006326582061365616}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0020282790639805184, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005520696211086723}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0021324501644318945, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005815454829014453}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0004152380160032476, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00018397799224345848}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00038092195139919884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00017352719125781168}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0003960521179313103, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00017805195394425088}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0019597074784293666, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005429844277966419}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0017448307905115785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00048409378403815574}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0018203546217228452, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005047529402091446}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0020637585328133834, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005760004715550667}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0018460791228919861, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005185190358660706}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0019229642762192925, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005389418518153393}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.0193423208210636e-40, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.482688743257922e-35}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..28525cd6b41b44bd0bd9860f998b47cd01cf696a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.14012944937421437, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 
0.001815307424647061}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.33928055295031845, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004205581767652354}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.19588864333192735, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024295469670554144}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.028568538737182608, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010066612852665242}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0723006950405151, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025924457641700845}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04039396274671048, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014127466271823398}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.1022109965196248, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012997576864037014}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.24906820252984618, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003169806335614786}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.14305945497627331, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017548887014810013}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.11145031743937453, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", 
"prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014910950299529468}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.27157007058633853, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036049451066097176}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.15603837919866542, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020168926814176018}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.535130559814857, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08809834580896415}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a07d03e4b51608eef0a4b10ef6597d1ab5f92a25 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.1950575300209308, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0036023058150002466}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.2922289142961424, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004398147986435147}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.21339819130609702, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002983888004350639}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 
0.04216057616807481, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018970859687410643}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06585825426835108, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002477991546098299}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.04676534492086212, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018057298964850564}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.1465833707685466, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002875377511839599}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.2195610151275998, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033993217008700853}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.15996882237844728, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0023436116694455617}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.15094912938206448, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0028937977529524183}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.22997442326868872, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003759859465696649}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.16604704688504865, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": 
true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024674421504619097}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 1.9675951573060544, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10694227812444064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7d7213ba8316f05b33cf7038f4ad5d93709eb879 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.2638394192805916, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004540435287238375}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.2738037170319721, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004200587916428268}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.24732615209837655, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034733941519721734}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.07001609054360647, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029271091177120886}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06909930052758312, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025744304924619835}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.06386379302307044, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0024181690250233505}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.20450354273425303, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00395614496467891}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.20875565805339058, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003359958014590378}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.1900422633935571, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0030029862330431846}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.2073880639894331, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003930522367662495}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.2151833283968645, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036280670247936853}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.19389966865847785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003031622262926847}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 3.0346172155561546, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15566119939690068}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No 
newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_3.json new file mode 100644 index 0000000000000000000000000000000000000000..70942e91c131c2098947a3dfc5b46c97426d5c71 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.26034346215610965, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004714473900591052}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.24906248646319382, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00415456465298038}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.23780963877739292, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037395882283707015}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.06660770770868331, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0028822154036991523}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.06179338888003384, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023891674540167077}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.05977234078723167, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023701821865365984}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.20086262918393394, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004038583972504473}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.18959349271627773, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: 
{{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033254851364105283}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.1819474141212443, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0031519157034204308}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.20259177495104444, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00403113378580041}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.19316090690700008, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003503292832588365}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.18412012740680855, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003175557497433766}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 3.425765641925955, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1979335091537913}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d5c7b07c65872fd0230f51a66da0b9f59965e921 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.07114285672279302, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004380915271764054}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.061764569030446005, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0037292099495240716}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.06088055060364192, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003575574283828377}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.01884955198069114, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002033397598483995}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.015445509919559106, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014952091434256999}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.015399984554066971, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015076205121051984}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.05564721298035322, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0035475780203014515}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.047419217528694754, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002898877332795932}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.04700874906652635, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028312951452581186}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.056405015194697566, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_precision_stderr": 0.0035709061865885116}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.04839142944384069, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029806554750605547}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.04779427597038301, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028660149390703524}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 0.2087867261327827, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06562705889970392}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a4f05e6534adae95a72831846c81056a2ab6e64a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_DOC_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.0023440923719853565, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006865168187194257}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.0018566841831715792, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005180019415016593}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.0020309670290739046, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005791928983528292}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.0001917762530970078, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00011735823792007502}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.00015886629094176265, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 9.83557528468589e-05}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.00017372564542375864, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00010697841454925956}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.0018456793659054301, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005204179651067365}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.0014866978138197782, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004055688639589803}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.0016117708575998733, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00044548916776186037}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.0018005407904788703, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0004985423695356434}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.0014571242644023771, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0003936140639568936}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.0015760361520538472, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0004295342382164876}, 
{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 5.108639972341924e-39, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.956752962357288e-33}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f7b55d15651c12994da80f828dde7466906e73e1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.1474899344681375, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001900079131590382}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.3534023285854842, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0042808487059165885}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.20565685899580208, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025302238335519456}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.03327897686130466, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010857124576225867}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.08257504981830704, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0027006296618498618}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", 
"rouge2_fmeasure": 0.04683864411040331, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015138269812197199}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.11026909240204424, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013894987564095334}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.26619441069312516, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003270460032424841}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.15403195663828284, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001865355995836426}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.1173973677189778, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001580655687968673}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.283327063520947, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003708823027951565}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.16403204921526296, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002134168071742254}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.8347869480000898, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0869883864663994}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..21c82e3a783cdb21b30d9d77fb27dad1595d4e33 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.19566499021136027, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0035669769380662765}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.27331585723108776, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004448638835604697}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.21005056026385502, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0031333342568339744}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.04186277589901304, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019629776118442544}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.058998681138224096, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024069269140667474}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.04475960677022685, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018872319158043016}, {"task_name": "gem_xsum", "prompt_name": 
"summarize_this_DOC_summary", "rougeL_precision": 0.14767848173331127, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029216531952624223}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.2044653904041559, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033809171614435102}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.1575307148373402, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0024757681091360194}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.15272574897907007, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002932780684799666}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2157182395445939, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003746152427089657}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.16437594892952237, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002587213479835073}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 1.938071033295324, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09541184002040398}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..667548adb8410c5773826b1d7384399e516e7004 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.2506617619333507, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004279548367868259}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.2771185146802034, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004038617755545862}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.2448482219235882, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034178077943435277}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.06019009652828162, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00257094644990412}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.06367536344844839, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0024323779221596563}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.057573246775443634, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022648000197355543}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.18921156917814427, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003569423509420866}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", 
"rougeL_recall": 0.2072997416186527, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003206717340734459}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.18384384732035777, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028420337598510335}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.19253397732901711, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0035416958117886277}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.2142839788487451, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034386239695474597}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.1881941197455998, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028677725202235333}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 2.775277917992019, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16055336437381154}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5c63e387d5269663ca92815b0c1af8d1dca8188a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 
0.2490418965952657, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00477783797042605}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.2514874020658644, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004242277195358792}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.2328552562831893, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037312986124307416}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.06439914737784772, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002947420000073468}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.06084688838603643, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0023670005477312694}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.05770154905937547, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002284739031825248}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.1902696505668648, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004048113449529588}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.189296062302524, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033439414911562394}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.17641797157296718, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003071228472405363}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.19325864739656615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004019456619493295}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.19548749971046633, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035310682553972364}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.18026475633170685, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00308320691761892}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 3.0847462253212723, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.2223901385109968}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fa853c6a94f0e076983001415e2c1c2349c90712 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.06149614061062767, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004002014655670965}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.05939632565343678, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00385700196779865}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.055469206946848856, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034842031118360157}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.016873072153203366, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001726552437885677}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.015979494842853718, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001568928538748884}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.015079805978761029, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014697982090914057}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.04847125659743045, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003240222188539927}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.046087232274086315, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003078628538409018}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.043274129761389696, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00280337313824995}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.0491128410938864, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003254043057422175}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.04736823871678665, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031650848308666943}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.0441116426489953, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028366835005017984}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.1693742801410804, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04377385517948794}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..87e11e3873f47182e5b99c62c0860bc3903800d4 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.0035734705546026303, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001375063651136652}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.0005441463597927607, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00021036208677550778}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.0009197860202658167, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0003511163847714274}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.0004288164665523156, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004288164665523165}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 2.858776443682104e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 2.8587764436821168e-05}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 5.360205831903945e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 5.3602058319039565e-05}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.0032875929102344197, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012833920426189107}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.0004905443014737213, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00018887263781695456}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.0008295088694126976, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0003145590430095732}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.0032875929102344197, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012833920426189107}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.0004905443014737213, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00018887263781695456}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.0008295088694126976, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0003145590430095732}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7aa99105d723d9aa109f595360309a3d522e516d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 6.278734834615668, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.2227369064421852}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.08160371186471228, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.002374549489609363}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7004950818659318, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.007036802366632382}, {"task_name": 
"piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.13373991461055282, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0030603070260723855}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.06495513323593309, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0021769496767186734}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.547686255496274, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008191322897272223}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.10747014085856893, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0030428865011832523}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.08082169564914758, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0023411236390164757}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6966925221944833, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007073853726693974}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.1327187453488244, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a 
wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.003043515110100461}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.07932773548315011, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.002353587483256341}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6812084956321673, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.00725244402825127}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.1299744874996082, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0030517622714577117}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eb3d3e9b7c88e3677a0951edde0c6bf88cfcba4e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 60.18185050682653, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.1724481335740067}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.6786023985669906, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006879636404319519}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.659172864309111, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.007228925418519524}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6518546551360388, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.007116978768503563}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.53388497210983, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.008002884678287275}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5268639680421803, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.008159634832380563}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5204293645702318, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.008052025950724288}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.6623660397033849, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": 
"", "rougeL_precision_stderr": 0.0070499025881858735}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6470537171924869, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.007432342342313776}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6391270001438089, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.007323388651758978}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.6652433097208901, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.007031373219079611}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6488361635192536, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.007399980576394991}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.641084851380832, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0072905490236244595}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2921e99df83729509bea7cb008f1ec5df39c8b40 --- 
/dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 64.64005573569813, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.1125840968414413}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7115721847853325, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.006383049267455568}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.6898981524653733, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006769920097945114}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.6887808582968012, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006614099300805803}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.568469754070017, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0077553126930507785}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.556983036110333, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007909142169515245}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5558632920359011, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": 
null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0078092241499811525}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.6970976003322448, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006591012707429238}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6783489276320744, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.00698850669069673}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6768058280329439, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006840870337470356}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.6995156426369364, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006561376900903898}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.6801204068768889, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006951032244743205}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6786509336030443, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", 
"prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.00680233159395632}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ce04c01fd5df39887ac6a9746930513f6a49fe53 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 68.22165305419762, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.8890448430097019}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7221627149025683, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0062305282040743}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7108029019292199, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0064897327358770865}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.7073112938146661, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0063804858035251604}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5820508960354065, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected 
solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0076299576959030615}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5773033129345385, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0077930736641026125}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5744304096662205, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007699595228276496}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7087419186042014, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006458269024355693}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.6993109497667697, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006723713657261213}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.6956339607612617, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0066200787187141895}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.711133750119118, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006421232852010984}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7011411370750749, "fixed_answer_choice_list": null, 
"dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006681507675650779}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.6975066700910248, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0065771947909297095}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5355c7dcf567e0bcb617598e284aa1a8cc141b06 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 69.84524760921622, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.3023157448019556}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7345229851633986, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.005986767051198255}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7253509489635235, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006227746850689511}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.7217534759948284, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006121513376796184}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.596040626875803, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007501433138942847}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5915933328294807, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007623825675456275}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5886766303319366, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007540513357647879}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7223498063566517, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006229876667011842}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7145458072279548, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006462844404701978}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.7108619060602632, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, 
sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.006364463104987073}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7243706723600197, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.006187955065543882}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.716189446557702, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006421987983352356}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.712522166886964, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006322893899736439}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..670fb282a506b924bd37a497500713744209ebac --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_Correct-the-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 71.08648151603752, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 1.0423319434802825}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.7364977609872968, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct 
solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.005944810805693522}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7306819844992705, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.006157866649224208}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.7261239196945068, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006051448617490669}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.5975955689445211, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007475842785462623}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5967441077322465, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007579782597891848}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.5928294767677813, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.007504735496121762}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.7246335948521524, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.006182415774653073}, {"task_name": "piqa", "prompt_name": "Correct the solution", 
"rougeL_recall": 0.7201906425984315, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.006395020962183312}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.7154921729488967, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.00629239525683029}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.7265795029105262, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0061418478208074}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.7218218633881486, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006352649666067237}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.7171276634610969, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0062501289605129545}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fd4b6f329c69e7bc0dece238ca13abd98531e7b5 --- /dev/null +++ 
b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.4896626768226333, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663330673075898}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.4896626768226333, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663330673075898}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..74d0ca09e56f1b9d8b31fabe5d7dbbf602b26afd --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011665713661738877}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5021762785636561, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 
0.011665713661738877}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c74b7dffca637b7b55207e2e70ed65a9e5163be0 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5201305767138193, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011656365410780372}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5201305767138193, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011656365410780372}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9ea2771b966436aabb97799133cc256b01193794 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5244831338411317, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- 
{{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011651830225709979}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5244831338411317, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011651830225709979}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1b45c28c9b72b032d2395086737cf1f1a76f733e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5081610446137106, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664270112244237}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5081610446137106, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664270112244237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3ecb9593171d93bddc6671340ef37bdfbb1d9997 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc": 0.5212187159956474, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011655314732288861}, {"task_name": "piqa", "prompt_name": "choose the most appropriate solution", "acc_norm": 0.5212187159956474, "fixed_answer_choice_list": ["Solution 1", "Solution 2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "adfef248-f856-44fa-affd-e3223718854e", "prompt_jinja": "Given a goal and 2 solutions, choose the most appropriate solution.\nGoal: {{goal}}\n- {{\"Solution 1\"}}: {{sol1}}\n- {{\"Solution 2\"}}: {{sol2}}\n\nAnswer by returning either {{\"Solution 1\"}} or {{\"Solution 2\"}}\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011655314732288861}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..782f0733ab6b2ed3c3bb2a31a34ed48a91041170 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.17066898481393913, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.00803261594895346}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.019988781791314362, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.000500033258601662}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.2189943530475805, "fixed_answer_choice_list": null, 
"dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.004079283289986538}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.03478661571339285, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0008014750154673911}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0031299626827861727, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00015456359119012437}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.038130006762105165, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.001999093625706957}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.0055069416857675855, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.000264484767130222}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.018169764912390914, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0004309343684239099}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.20275069372478832, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0037184478182369536}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.0316936420027784, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0006939152149686633}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.01648814395051073, "fixed_answer_choice_list": null, 
"dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.00041116448513992096}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.18839597007095757, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0037196421404839127}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.028758460694468576, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006600358478717193}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7b7a7ae46914088fd74eda6cc5effbe26c727bcd --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.31915048013706, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.02620039827296871}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.08315699283905327, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.003419242714843206}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.1670935715647722, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003999059624587211}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.07886064580447584, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, 
"subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0026219177447909005}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.018370065672256373, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0014347031808393837}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.03157703113403232, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0017958384553041604}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.016827951371953354, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0011812390790814646}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.0673356509458437, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.002737988776916549}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.14655813236600299, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0035859866081953853}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.06542274863101005, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0021692648679794255}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.07012972287523524, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.002969174912446342}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.14278060144306878, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": 
null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0035650821245661137}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.0661469228892784, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002260599148736579}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..38740b50db2e2412e030d23f01b6d76e6fc33768 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.5038883821200564, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.0604617926818546}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.06764853867416688, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.003565719899866043}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.0646429180266884, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003083663833624245}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.05198294358223645, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0025042512053083032}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.015459698428139546, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.001643754856124599}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.014396007314664165, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0013434842111948982}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.011752668395362776, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0011342195396425946}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.05781587124982036, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.003092606353558658}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.05674010175258449, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0027470600077108814}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.0448509387316729, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0021869950723516175}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.060347710618179395, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0032401079418637705}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.05744446846929318, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0027724301585786448}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.046043469207693984, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, 
"prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002239044296435396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..162bebdc86ac635e2ece2ad7d4c7f15672b90419 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.417256690773128, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.05417089218251962}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.07310603926070969, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.003697200472520524}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.06537860586537475, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003132408607889633}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.055698333051118806, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0026311711420543356}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.01698481574652303, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0017805476081186934}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.015257654221563844, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", 
"prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0014614481898404892}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.012670702709971336, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0011548610533438693}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.06317321591157367, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.00326988264639926}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.05769516321706676, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0028323811792588284}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.04835453953831574, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0023265907591892786}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.06558154945562647, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0033791812198662656}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.05918648566874358, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0029013049659238066}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.04994572581381962, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.002394485017637533}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..60654c98a205be90e2d17069e5046f7fa04d8687 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.5230628724174664, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.07819793788240743}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.08730073095091566, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0039923132219365646}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.07777229717916978, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0033592902246069372}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.06832044026429208, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0029205130223798324}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.021287957435449874, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0018928999405201379}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.01847896121529526, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0016529295281884947}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.01641917483304456, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.001396907601907277}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.07479886960092183, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0034906398849194483}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.06818337939770107, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0030433206990172095}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.05888348175750054, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.00257744328152746}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.07758866056449988, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0036373270512517343}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.06936506932934977, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0030738111400345396}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.060425478261301964, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026348754382351006}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cb7b128dabbd0375dccebc8df2dfe1bd2ea119cf --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_no-prompt-needed_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.6835675289713131, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.09201535652694026}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.0970156487496315, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.004126042501770336}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.08622233259578811, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.003503196213825024}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.07518330292771172, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0029704830489176897}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.022354315529872364, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.0018287202748284283}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.018862588281324987, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0015455208203762356}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.017013197441592458, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.001321185275390085}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.08321458322323663, 
"fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.0036022749449818596}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.07523134647928398, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003155331153332636}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.06482059902196627, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0026255826270300616}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.08611431602639111, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0037595269875219453}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.07664158416334983, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.0031935933997685262}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.06646799284350768, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026802307096209666}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f65cb9d104a78850fb875ae8c70c63f455f6ce87 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 
0.49510337323177367, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01166526473007815}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.49510337323177367, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01166526473007815}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c46806b9e55d75c3944497d5fa55b2679af44342 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4940152339499456, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664988455853323}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4940152339499456, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664988455853323}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, 
"seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..764d11fd306f29e64531a66f78266dafab09d6a6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5070729053318824, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011664656918145945}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5070729053318824, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011664656918145945}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2778ffa626e22ec562ac39a4c3e1d184e11d03bf --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5228509249183896, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011653634832401168}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5228509249183896, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", 
"prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011653634832401168}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..389003e3b6eb7c70c2094a563bedb4aa06240b19 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.5092491838955386, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011663828032649181}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.5092491838955386, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011663828032649181}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fedf050cee5eb82d9509e699dee2637f22721063 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_pick_correct_choice_index_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc": 0.4961915125136017, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", 
"acc_stderr": 0.011665485744746797}, {"task_name": "piqa", "prompt_name": "pick_correct_choice_index", "acc_norm": 0.4961915125136017, "fixed_answer_choice_list": ["1", "2"], "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "44778818-7b73-4262-a294-c00fc32b6c2c", "prompt_jinja": "Sentence: {{goal}}\n\nChoice {{answer_choices[0]}}: {{sol1}}\n\nChoice {{answer_choices[1]}}: {{sol2}}\n\nWhat is the index of the correct choice for ending for the sentence?\n\nAnswer:\n\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011665485744746797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebf6950ee43a5509efa82e2e574c3edd0932e2 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.558215451577802, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011586482494310218}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5603917301414582, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01158041724865657}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..731deb382f4287a45ffdf82a2f2b6d5edf00fa2c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5522306855277476, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": 
"16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01160199979686681}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5527747551686616, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01160065944329292}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f1e840c856aaaf86c77bc26d752a1d99ce2b0055 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5500544069640914, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01160722083798011}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5495103373231773, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011608491028638188}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..40c89df4da8170f31c288b8a6480c1f583bcc6a2 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.543525571273123, "fixed_answer_choice_list": null, 
"dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011621538875661537}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5424374319912949, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011623729421518137}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4864e1560dc810690c69c8f0b3a9dea746253d3d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc": 0.5478781284004353, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011612217507379627}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5478781284004353, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011612217507379627}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..15e870dbdc5153de92d6b06dcf92cf1c85c26633 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": 
"what_is_the_correct_ending", "acc": 0.5522306855277476, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01160199979686681}, {"task_name": "piqa", "prompt_name": "what_is_the_correct_ending", "acc_norm": 0.5516866158868335, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "16e97a16-c958-4956-bfba-279f88dafd5b", "prompt_jinja": "Goal: {{goal}}\n\nWhich is the correct ending?\n- {{sol1}}\n- {{sol2}}\n\nAnswer:\n|||\n{{answer_choices[label]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011603326108334502}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4562a22f41d63ab9bfb9f1e9484cfe2c03591ae6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.598, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015512467135715075}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.542, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015763390640483706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..05c84d57d795a9bebc0fe5f96ce40a11ee319fe5 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.662, 
"fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014965960710224475}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.645, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015139491543780529}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cc27cff45500eb4a50c8bd8c33dd52b2700a4465 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.664, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014944140233795023}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.667, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229864}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e6bdac4046aab21e46b36e45d536153eeacd66a0 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.675, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: 
{{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014818724459095524}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.692, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01460648312734276}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d3982a34c0ebb198fdd73fb3b6f19c0fe770b135 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.687, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014671272822977886}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc_norm": 0.701, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014484778521220468}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e1255086cc681ef70500b17ae0fedcba4d878e0d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", "acc": 0.692, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01460648312734276}, {"task_name": "sciq", "prompt_name": "Direct Question (Closed Book)", 
"acc_norm": 0.691, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "0af52ad2-2b12-4700-b664-cd26d2da6dc1", "prompt_jinja": "Q: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014619600977206494}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bb17011d71eda2a18cc99ff1febb03b3d15efc98 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.855, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011139977517890145}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.77, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.013314551335935948}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..714f6ea65233f12dc9c5550f5aa884b21fa044b1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.906, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009233052000787733}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.897, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, 
"prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.00961683333969579}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8ee52467103f3757a81a2c4c4e0a27c7e2d82eb8 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.913, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008916866630745894}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.91, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009054390204866435}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9699160a561a723e3e8dea5f4f4677c057625c4e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.921, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008534156773333456}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.917, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": 
"d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008728527206074798}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..af7e9ea3bf33a566153d4986c023bebc19f638bd --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.922, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008484573530118583}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.919, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008632121032139978}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1785d7edf1a008041b87e5e24e4b4826703ac2a9 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Direct-Question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Direct Question", "acc": 0.92, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", "prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.00858333697775365}, {"task_name": "sciq", "prompt_name": "Direct Question", "acc_norm": 0.922, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "d417fcfb-9f00-4186-95d8-e63609495164", 
"prompt_jinja": "Answer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008484573530118587}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1b0cd733d5d4be1d5437d39cd8f413e5711f5fc0 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.342, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.015008706182121731}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.362, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.0152048409129195}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_1.json 
b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1e47c8615065c11f7dc171f16834bed1b901a48a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.341, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014998131348402702}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.341, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014998131348402704}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a6eab15663e44d28d640eb7f69a43c5e15fd01ee --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.335, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 
1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014933117490932577}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.337, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014955087918653605}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ef9d36f028b58b2a0ce434ffc3d5fa0ed59df3dc --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.317, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014721675438880215}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.335, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], 
[1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014933117490932573}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c778b6a223e2f78b8731cbdc91c245338e8ad090 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.328, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014853842487270336}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.342, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.015008706182121728}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", 
"task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..279789a0787e7d9059f16af085f1b88c9a89e215 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc": 0.338, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01496596071022448}, {"task_name": "sciq", "prompt_name": "Multiple Choice (Closed Book)", "acc_norm": 0.334, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "15b0a989-84e4-4f1c-8ac1-12dbfa2ff42a", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014922019523732956}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..41e6dc874413e2fc048f3edff10d99315a74b095 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.329, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": 
"368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014865395385928364}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.342, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015008706182121731}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..377fc75ec910167ea37903a4963d13b3ac90ddaf --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.304, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ 
answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014553205687950436}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.318, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014734079309311901}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..acec58c2270827846fccbf8415b45fbddfc6a7a6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.294, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014414290540008213}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.311, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 
2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014645596385722695}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..008b838bc341fee27cfae3c0bcd34b63c396fefc --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.292, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014385511563477343}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.314, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01468399195108797}], 
"config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ac153970bc59f2fa9d651586ea9dca891e032d78 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.316, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014709193056057121}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.332, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01489959724281148}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_5.json 
b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..569aff851d1f70986569ade89e54cdcc9f04656d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.328, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014853842487270333}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.327, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014842213153411237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a19d6a1506749df524ac58708fb91003cce5986b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.359, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 
0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015177264224798592}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.36, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015186527932040126}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e2e66f999ff0f169f8f8520a9880f62ccd353f29 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.359, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015177264224798596}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.378, "fixed_answer_choice_list": null, 
"dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015341165254026649}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6f49ddf505a906eb022ba5b516dd5618489bff07 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.322, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014782913600996664}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.333, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ 
answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014910846164229868}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..58010818c40d11536dd3dd1e15290fc1717d5f3c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.339, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01497675877162034}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.351, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015100563798316403}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_4.json 
b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..155d8a84893efe65b61aa334bf3b7ba8d4eec073 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.346, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015050266127564441}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.358, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015167928865407557}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c77f3306065c192917e1f5271193b313d355b0b2 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_sciq_Multiple-Choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.346, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 
2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015050266127564438}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.358, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015167928865407555}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7de7c3f249ccbf2245570b9fe153ca839881d0b7 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.5104222340994121, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011559920087347776}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.5248530197755211, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011548139823074772}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..984c265815d3ca12750bc865d35e2a6f2608f70c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4778193479422769, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011551049647290314}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4965259219668626, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562153149168287}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dce5085e52b94200c8593cbbf186762a0598eb13 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.45911277391769106, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011523708060182086}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.47140566541956175, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011543509045585211}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6fbaa9af30d39640f12f5d1bf3eecd56ea41db64 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4489577765900588, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011502027057558886}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.45323356493853556, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011511744771088355}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1a7522c527628b6b25338f5b1b595bac5da3920a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.45056119722073756, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011505771738769861}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.4452164617851416, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011492819519292359}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fa30d35355d1245e2f783bec7712e5bf6e309eee --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc": 0.4409406734366649, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011481489309428048}, {"task_name": "story_cloze_2016", "prompt_name": "Answer Given options", "acc_norm": 0.44468198824158206, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8", "prompt_jinja": "{{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What is a possible continuation for the story given the following options ? 
- {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011491450380971893}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..667f720a92d41c87f8bf34dad700262eaaf9d9dc --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.5104222340994121, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011559920087347778}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.5344735435595938, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01153491734135513}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d22a78dbcc77f671e90ad57495eb799cba66bc9c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4730090860502405, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", 
"prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011545573278697237}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4991982896846606, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562417388300208}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cc96199cddecb7ae47baeb972fa3e072129edf28 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4559059326563335, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01151738312396153}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4681988241582042, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011539022035111231}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cd9938dadcae7693cd6520d9ec54f62e68f3b0ec --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4585783003741315, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011522687288692525}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.4730090860502405, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011545573278697237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f0fb78b931d770647672f3df4e514585bd1b1ff1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4462854088722608, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- 
{{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011495517440721683}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.44735435595938, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011498161586686657}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4e4e8f7ec099f6ebe610f825f928c190c9b0faf2 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc": 0.4575093532870123, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011520605695184077}, {"task_name": "story_cloze_2016", "prompt_name": "Choose Story Ending", "acc_norm": 0.45537145911277394, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1", "prompt_jinja": "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the previous story from the following options: \n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011516282203726656}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..28c72bf6463c3e3455eb34e55525821cbd9cf054 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..401a48a8e088290d8dfe9749dce0f3c8e6606786 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..70d4b5fd1448944432a85ef7a9c794f078e3c89f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5456cd940bdb1bfdb56609c6ec167b5bd7be07a4 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No 
newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fed91796c220d51b5e0a63315138d903a144fc13 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e9a41e8acef46dcccb488a5cc8880f5c87150cc6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1 @@ +{"results": [], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2291f8576f4841857db072477a73e33a3ae4918e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.515766969535008, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011556682042196382}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.5259219668626403, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011546883081384905}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a09e8931349710dd21353ad83a4f8f744d7c6a1f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.48583645109567075, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011557792331301667}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.49599144842330306, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011562060664045738}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..73593f8a42fc504522e9bb9521de258853d1feb9 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.46285408872260825, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011530479981182623}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.46873329770176375, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011539803085637733}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e05d8df64dd9960bd0db18e8a749f5eaa3ba920e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.4585783003741315, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011522687288692525}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.464457509353287, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011533182338113986}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ae13787e9bdbbaf31c0d678aaafffa89ab4a078b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.45056119722073756, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011505771738769861}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.44681988241582044, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011496846233300528}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..93d0537e933216377f941c6302057ff62abbe9a0 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc": 0.45163014430785675, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011508201145928352}, {"task_name": "story_cloze_2016", "prompt_name": "Novel Correct Ending", "acc_norm": 0.45163014430785675, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "c6f3d802-4f97-449f-a911-03470d418f7d", "prompt_jinja": "I read the following novel: {{input_sentence_1}} {{input_sentence_2}} {{input_sentence_3}} {{input_sentence_4}} What do you think is the most probable ending? 
You can choose from the following options: - {{answer_choices | join(\"\\n- \")}} ||| {{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01150820114592835}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b4894da855319b8f364c8c2c710aaec5bf70a875 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.5034740780331374, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011562153149168298}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.5285943345804383, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011543509045585206}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fe476e5ff9ab47d60521940ed66877339245def0 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.46018172100481025, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011525709570367512}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4949225013361839, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011561836054238772}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f7af78bbad4503379bb0face03f3a533bad860fb --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4521646178514164, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011509395748220111}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.467129877071085, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011537420054210294}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5e04b7a59949adccf471845c2c54c815d5277736 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4478888295029396, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011499463505491369}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4596472474612507, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01152471548624064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..21aab9ac74c3c9ff765c61b094f2042efb64d87e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4393372528059861, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011477017982308784}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.4398717263495457, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011478521926587444}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bbf20131646d1181a27330c41d323ff834512573 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc": 0.4398717263495457, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.011478521926587435}, {"task_name": "story_cloze_2016", "prompt_name": "Story Continuation and Options", "acc_norm": 0.45056119722073756, "fixed_answer_choice_list": null, "dataset_path": "story_cloze", "dataset_name": "2016", "subset": null, "prompt_id": "b5c8445f-2d3a-4691-bdd5-58956816702f", "prompt_jinja": "What is a possible continuation for the following story ? 
\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.011505771738769863}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..79cfb3fdea62b923e7e728381de8c7e57b2ae35d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5306859205776173, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030039730592197812}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.47653429602888087, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030063300411902652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2cb63b729db0313c6c259732b8fbb233e5cc69b8 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.48014440433212996, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317194}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.48375451263537905, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f9a7bfe2e70b8ed0f0ed8223f3c6505984e2ae32 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.48375451263537905, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030080573208738064}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d3bec362320d7f8bbc4d0c15880f64efa31aaf4e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.48375451263537905, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.49458483754512633, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f3752c018f3026f87feda621c0ac23a638d605dc --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.4693140794223827, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03003973059219781}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.48014440433212996, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030072723167317194}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8636e62881c6bbb6d79ebdb76697eaadd5b87bbc --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_GPT-3-style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc": 0.48375451263537905, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "GPT-3 style", "acc_norm": 0.48014440433212996, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "8fb1c6aa-20e9-438c-bece-c6af1c746449", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True or False? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030072723167317194}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a96c2bb18a0778e0a8ff2734eb1c136cd0df12f1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.4693140794223827, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030039730592197812}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5c8779740bad86bb2a0b8e226be29d03ecc65b50 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..41b44eae2178983f4063e58c52d1ec396a56f6aa --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976626}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030096267148976626}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..21f0f991d033b1a3fb55ec42e8e5cd9a95cc1531 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.4981949458483754, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f3c87355865ce181f8876aa639e56d9cecc116bd --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5306859205776173, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03003973059219781}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2996054ad15b8c3c7dd6ff1b07bd58eabdcd9c57 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc": 0.48375451263537905, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "MNLI crowdsource", "acc_norm": 0.5090252707581228, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2b52a83c-0021-41fe-b44c-5aaa076d71a2", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, is \"{{hypothesis}}\" definitely correct? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fe48323b7674ad313e43d850a41d0165ae55fa9f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.4296028880866426, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.02979666882912467}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fc28856eb2b6a362f57bc2ff7aa581f71a0131ef --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a819251bc76a9212e877add26ebf256a20035c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ac1d098de7adf6545df665fefa4a5888c33c282a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317177}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030072723167317177}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ad8818da51b6e54affa126ddfab62803561bf207 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030063300411902652}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bd58c2331c508597a0c3cc86ba3e1a86d9682abe --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.03006330041190266}, {"task_name": "superglue_rte", "prompt_name": "does it follow that", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "e2fb58f2-b1f2-4aef-b74b-c4ee1c571fff", "prompt_jinja": "Given that {{premise}} Does it follow that {{hypothesis}} Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0f795f0aa3d01c778928b67cfdbaf6c2ed5d8843 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.48375451263537905, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d77080e395d58e1597a09c15efaff0eb9818abfe --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..08de87a7da0186cfd848245e142292857a1af369 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976633}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e2752a142f100c5994dcdd77200fd70360167e86 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5018050541516246, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030096267148976626}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03006330041190266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9faa31da2647637180b0e62a460591c49b15a2ef --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030086851767188564}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..48a0eeec35b461dc4c04ab359e6721b837e48373 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_guaranteed-true_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc": 0.516245487364621, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030080573208738064}, {"task_name": "superglue_rte", "prompt_name": "guaranteed true", "acc_norm": 0.5126353790613718, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "2d0d63da-ffcf-4f6e-941a-b8da922be43e", "prompt_jinja": "Given {{premise}} Is it guaranteed true that \"{{hypothesis}}\"? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030086851767188564}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..51f52bc6796e22630b23a584592120144a16a2c6 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.47653429602888087, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030063300411902652}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5270758122743683, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030052303463143706}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..72d755882e965d15a3bd8d0dcdf29e0d9cfd3c6c --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030091559826331334}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.49097472924187724, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030091559826331334}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..83616d2fd9bd11f605638c4c913380eced0bbe52 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030094698123239966}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5054151624548736, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030094698123239966}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..71ab2838158987d8bab105a8211286c8bb7d2be1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317184}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030072723167317184}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..158d577f24274a2fcf9f98dc303208cf94d4aa3e --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030025579819366426}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5342960288808665, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.030025579819366426}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..08896ebfa9ce79216bea30967a0121be0c72cdd0 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_superglue_rte_should-assume_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "superglue_rte", "prompt_name": "should assume", "acc": 0.51985559566787, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? ||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.030072723167317184}, {"task_name": "superglue_rte", "prompt_name": "should assume", "acc_norm": 0.5234657039711191, "fixed_answer_choice_list": ["Yes", "No"], "dataset_path": "super_glue", "dataset_name": "rte", "subset": null, "prompt_id": "b8dc85c6-28b6-4340-979a-8e77c2a0dde8", "prompt_jinja": "Given {{premise}} Should we assume that \"{{hypothesis}}\" is true? Yes or no? 
||| {% if label != -1 %}{{ answer_choices[label] }}{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.03006330041190266}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7799ea9d3855c8d37b3d96b502250e34c4876991 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0140492945362904}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.505130228887135, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405174596179051}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ad2efdb4f7bfa62e25eaa338745a981068618f2d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5240726124704025, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014036189665395134}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5122336227308603, 
"fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014048278820405621}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..458000ef0eca276e10827872426d06ae59e00de1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052481306049516}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cea7c856e5ed6492781f025a797a6f89cdcbdff4 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with 
the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.0140492945362904}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5138121546961326, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014047122916440422}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2cf203023b8765ec32cb7f6d6b1645e3cb4f9861 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "Replace", "acc": 0.5217048145224941, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014039239216484626}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5169692186266772, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014044390401612967}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0a76113d9c98ed5ef2b03b1d6fe5ee41d04baebe --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_Replace_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": 
"winogrande", "prompt_name": "Replace", "acc": 0.5382794001578532, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014011242594964123}, {"task_name": "winogrande", "prompt_name": "Replace", "acc_norm": 0.5311760063141279, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "e7e42e9e-bc57-46ed-ad8a-76a5b90a5bb9", "prompt_jinja": "{{sentence}}\nReplace the _ in the above sentence with the correct option: \n- {{option1}}\n- {{option2}}\n|||\n{% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014025142640639513}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5cc465c76956d46a1a17cab66278943a0eefd182 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4956590370955012, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014051956064076896}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4964483030781373, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.01405213114691586}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2018bacbe85ab7d6aeaf6adcb15a082df5c2e51d --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5011838989739542, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052446290529012}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5019731649565904, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052376259225629}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4dbe6b7a438a74fb3b355a0df8c3fd9b9b42d624 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.4940805051302289, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.01405150083848581}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4980268350434096, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052376259225636}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1b43724ba7301eb869f6ccf6ba8d4ff1c79c651a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.49171270718232046, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014050555322824194}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4940805051302289, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014051500838485807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..53fdcc02da86bc024472e12d0887857898901ab5 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5035516969218626, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014052131146915857}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.4964483030781373, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014052131146915864}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..23d0791dab886d6e173e60438454a991334d5301 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_True-or-False_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "True or False", "acc": 0.5098658247829518, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. 
True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_stderr": 0.014049749833367589}, {"task_name": "winogrande", "prompt_name": "True or False", "acc_norm": 0.5146014206787688, "fixed_answer_choice_list": ["True", "False"], "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "d11378d1-2f24-4509-bbbc-bfa2921300d5", "prompt_jinja": "The _ in the sentence below refers to {{option1}}. True or False?\n{{sentence}}|||\n{{answer_choices[answer|int - 1]}}", "prompt_original_task": false, "comment": "", "acc_norm_stderr": 0.014046492383275835}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dc5d79929877a124dad1015e2d1617949715d6f5 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5130228887134964, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014047718393997663}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051956064076896}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0d1b77bc699c6628bb350e65cf61fed2105f4059 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5146014206787688, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01404649238327584}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.49013417521704816, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049749833367596}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2e8f0ee3b350c04ae8d409269dbadff9f210da3f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4861878453038674, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014047122916440419}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.48303078137332284, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014044390401612978}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6a2dab07754f30b8ece6cd7626a9db3c7107c585 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.5019731649565904, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01405237625922564}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.4940805051302289, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051500838485807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..00d56c376592db611d6324d317c8f65e1c42f576 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.4711917916337806, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01402914161590962}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.46408839779005523, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014016193433958298}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7021203704f54ec9ee38d3bbf57d2bf8f9404abe --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc": 0.48539857932123126, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014046492383275839}, {"task_name": "winogrande", "prompt_name": "does underscore refer to", "acc_norm": 0.47434885556432516, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "0f23f058-5b4d-42a3-92d4-5d60688aa90c", "prompt_jinja": "{{ sentence }} In the previous sentence, does _ refer to {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014033980956108557}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f12941e6bb10a9b9c830b9e966d078d0a391a142 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5074980268350434, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050905521228571}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5011838989739542, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052446290529015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..65e5ede7fbbd7a345adca871e50dc461d941b333 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5185477505919495, "fixed_answer_choice_list": null, "dataset_path": 
"winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014042813708888378}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.510655090765588, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014049294536290396}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ca4eb8fa54b5ada0a3339d45b9d1100c938ba100 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5114443567482242, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014048804199859325}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5035516969218626, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01405213114691586}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..24aa6be90cde318f7bfefe23a241b23538267c08 --- /dev/null +++ 
b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.516179952644041, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014045126130978601}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5146014206787688, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014046492383275835}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..43fe63a44abf7d7c1d533b0f339241d83632113f --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5098658247829518, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014049749833367585}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050555322824189}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_5.json 
b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7db766c6d0e4e49b837677bd7d009e5f24b8f98a --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_stand-for_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "stand for", "acc": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014050555322824189}, {"task_name": "winogrande", "prompt_name": "stand for", "acc_norm": 0.5067087608524072, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5080f912-fac8-400f-983c-944baf9b10c0", "prompt_jinja": "In the sentence below, does the _ stand for {{answer_choices[0]}} or {{answer_choices[1]}}?\n{{sentence}}|||\n{{answer_choices[answer | int - 1]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051220692330349}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_0.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d8a698b09343c739c2d99be922e53231d2dc6ad1 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.500394632991318, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052481306049516}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.4996053670086819, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052481306049512}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_1.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4601693061216727e663832206d297ad058880a3 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5027624309392266, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616441}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5082872928176796, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014050555322824194}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_2.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..673e3cde237f5f1e8d7ff07316f27dda5424ee8b --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.4972375690607735, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052271211616433}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5019731649565904, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014052376259225627}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_3.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..692683c58bf13aebc0016020ea0668120b1daf04 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5035516969218626, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014052131146915853}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.0140519560640769}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_4.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..55f607c1f100f5c87fb4c1d031c9590007bb6711 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051956064076903}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.494869771112865, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051745961790516}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_5.json b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..57a50218b6786821a45a02e74eb2c4c5b90f51c2 --- /dev/null +++ b/4b284b42boscar/eval/agg.4b284b42boscar_winogrande_underscore-refer-to_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "winogrande", "prompt_name": "underscore refer to", "acc": 0.5059194948697711, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? 
||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014051500838485807}, {"task_name": "winogrande", "prompt_name": "underscore refer to", "acc_norm": 0.5043409629044988, "fixed_answer_choice_list": null, "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "subset": null, "prompt_id": "5af00ba1-86e0-421b-bb97-26bf58df52d3", "prompt_jinja": "{{sentence}}\nWhat does the _ in the above sentence refer to? {{ option1 }} or {{ option2 }}? ||| {% if answer == '1' %} {{option1}} {% else %} {{ option2 }} {% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014051956064076903}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..daba122f2aa67fd3f666e14fcc7b61caaf6efd57 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3397ef481af8fe0c38172475ebdc8575c5a6501dd65d01b46710085d236f747 +size 8252512 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..12ea4fd18c67cddb9bd268510fdc605940006558 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b412ef6ff25d1685f3ef2cfce24c1def2ffedc59651c4f92ae6e6f40214e5c64 +size 9268068 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..28c1c20abbceab802b839b15a1437ba9d9bc3bc4 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cdc6d76f3d5a64e6d50dc77a2bee47135d1cf28dccce54ce8a0fd8922acf1b3 +size 5539966 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f71cf612cef29648c8ac15b9fe6afa6d7c5d44a9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e4d89579b4ad9454e9bc89bd05136c079b6f9cf9c242c966f62b1558102a44 +size 6449146 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..6fe6d0b63c27f3962a94185911de0bfe1f439235 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ce7a556f0619db806fe1066df53b03f48dce3325102a2db7b8bc617557b7b3f +size 7300675 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60e0ce4365863c1ba9a97da912aef0727f7ef391 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55fe4907d7fb398b10789ec47ee4c7c7c6ea670258c4e8af3876f27cf62cbdf8 +size 8165397 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bebd4fc3ede74ad3b9fbfb6c556ad4d7d62d0013 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c671c6d4156cefa36c38fa694843ce2d83af93e1ba35aea205484701a087e9f0 +size 8825120 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f481cce180235b2ad1cf56b6f44a3199add29558 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ca44b0c354fa6801ac036ac879aa21fb93e5ea6ea92600701d059629fcfb325 +size 3403485 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..300cfd9d2e129414056aa1c826e207d8219bb06b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99af3921b28a040db3c9dd52633a8a17325a253170f01fcde9d7f95a8d0dc784 +size 4038296 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f51c751a3f085d31d21da85f4734521ee1ae19e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a15485b1f80108c3185039210bc7ed90a734f1ccff25f4c55d82ecd15db36cb2 +size 4773108 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b6c389a102df62ac6eb4f0a014bd15dc1fc4d81f --- /dev/null +++ 
b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7623e0a924e1b51a4adc449aaaffc42700d7ad372d25eed1815cabb941b93251 +size 5504717 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ac96e1ec7ffde16c76d85656f00a2743cf84a76 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:887551077ef46a4bc36589c2dcd9229e37ccced825112999cc03b8d527835891 +size 6237923 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aa2862b9fbd88ebfae428ec4e98e391182da17c1 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7236887c0472f0241165339a8ca9cd7bd8205fc1f1f5bc30dbf1a44d2aaef84d +size 8209474 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..678b5f33fa1cf0cad12d191c243989b3f91db893 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06f8263c3072213ce4c5d3282959ff2c4228ddd4c45aa4813ea5fe83ca68e388 +size 3193942 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0fb8f07a4484b47fe6bcc6955573e9d4fceaf3c1 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:697faf6b430f5bb0ecd4f1b2573576fa75578f6731550ad622675307bb9f5f52 +size 3753829 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..092c50ea3dd91e1752483e0960ee8df7b419125d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb4c48b2b93a850ff99f11fdca5a60a37fa7c5d037daf4cd019013cdcc20fa10 +size 4428131 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..874f46af786d220db8d7c99672921072567215dd --- /dev/null +++ 
b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9847d4ee168ca3ec3f8992acc3490c949a3367f029bd20c70750e408447a50f7 +size 5086368 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d5a34ca17b326e2dc6b19216d72aeb692212d5d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bcdadb7783b5924dd43ea329f62828c5b358edcce2c11c662ac71447cc798e5 +size 9014676 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b3565a08ed9fcdf0e0c6f9c6585abf9272a98b1 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48f422a27d71812ab77dee7e965a3920f8d1e163615fd9639be9833f66015786 +size 7731622 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4900d7159e82229e32c1b91164d10dfdb6f94fd5 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12d41846cb960be64914ce736a6fb83e0770f0aaf1fd21b6e6ead7aba25e004e +size 4621171 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..062cfb624be3891764a293453fe9b4f01e5b13e6 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5584e6a06bd71ade37d5fe8796e0cd6457260e1fd4482b9c6cea58a96250c59f +size 5461217 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7973a759c1bb1f51f4d1b62294f29523b318e122 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bc2ad2f9da78d7f959533e9de7282de415b27837ca94dd8e58800ba313f5c6f +size 6312746 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6939182fec6a21de1df850d7edd18391279a1396 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_0.jsonl @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:b8a7be4bf26b814079db7f1db86fb01ce1341e32acb9544782d6171c6e072e37 +size 10055712 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1e136061a11215221cd7308f1aaf004e7b522bac --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3be811feee85820d5fd0a3dfece670a44d7d32374f0a884437bc5efa83ee4be +size 10428270 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cc6d3a4c56c1ee56267d5ee84317974e6f6ad1ef --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07754fc077ce29a3de55edd0a42f994ee34d541a7f2a1b4376dc58efdcb0e0ed +size 6578948 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d46c9098f35d3d96800ac94174d86973535b969 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41dec5e78909bf6741f1cf69df768b65bc605021fb934ea9f6c52427405362c5 +size 7813205 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9f4bbacf84e9a9c5262cffcaff7af68c23cf2262 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92a87fe1f42db6ed46cb250f01361cac2648fc6c7c598254e9faefeef6435723 +size 9110544 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5875625fc8a5b30359f5ed59bafb9ea28e031114 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:706dd1fe1327521f74c298bb612d2c2d6581355f3db09e01098019b1d9658f21 +size 7655808 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..52ccca9c54b2a8e830009545660d5cc4009ee489 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8533d057313bd2952deb4c0c64a31e2f537a4cf386e7c1200695f75d6b1d2fe7 
+size 26693896 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7bcc0cd96238f9fe08705d3a202a5f6e538d1a98 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6753ee085b334978148e5e75caeb90d85ae1ee9d8a7c5a0de4e1c78f5b69a2c2 +size 19004423 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60300644e81a71e322d399aec1274e5f96941e01 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7c57e940ace96eed0ceceb9e6d23f1441b9c11833ea8963cc84f936d3c4bcf4 +size 24523448 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2c234999228e56834419a9baa5aab562eb9bb6a9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d89748db56bdcf6010d1157fcaa258b1c31be0d09ed410c431abf4d6a021743 +size 29833321 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a839e1f33b2fa76d064e024081e62bc18d79fcd9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac184776e5875ba20e8be15179018c79aa04b27311b36f63e06be93423f4c343 +size 35281580 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6b25c832028dd56ef6c1398f2ff3ca2774b0a90 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad818237e79df8e4aeef28bec4413625e3b603ab5fb696dfe1debe509ede058 +size 7744913 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f42fa0a2e2383f1aeddac8ce23a9bcef9bc9933f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcb8e7428fc2ae9f48578dfba1fc2e4deb80db92401d3a84d62ce35a44814796 +size 26615194 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl 
new file mode 100644 index 0000000000000000000000000000000000000000..f039f9b1c776e991441360831a82b2afe8d6f340 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3216fcb8c31fcd66ae01350be1eca5a803be2bf8f93ee025f45462076954d062 +size 18978461 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2021188c3868fc6b8c403fd7759b29bcd897957a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d515a1ae828dc1452c1e90e5815537966ec7b8d6484205d87477a560cf092ce8 +size 24537858 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7d6113c9bca7e7ca83cacc420e4acd0ebe06da91 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d27da3ede28d2db7e5254a1d44a2ec166cd2cf4ce2e87dd85c2a410d9f466a87 +size 29929888 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..42c5e1aafa0e735cc9d367b39e816e777c82d042 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4c9852bcb8dff173c55cc813c14faf7676277e32c6abe07fcc87b4bf2808d4b +size 35424512 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..82f3c03c45d0da2fc52c16abffb27a8c8be9961a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ce2bd5b138f3958b20320d2a776e3b272c49912e950b94774c05915237acaa8 +size 7619255 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eee4d5f361e3eba695b0c81c6fde7e8dc35cc123 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9652778d8b27a37359cd8aa1c80eadc7f9d0c6bc377fc012d5e7fd8fcafc6a74 +size 26777458 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0124ed8f60614867201dcc0c535016c2950f9b11 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_2.jsonl @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfd725b7b580ff0c1e2fea4ea6c84a1427926cf39303ca9e800c86c29724ce94 +size 19087213 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f8b3b85164812d8d5f662171bba4a91469b8c4b3 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51dcf54d3558b62c309c7aa41e984cf8c8bf223fd24266d0c6698dc500b88f02 +size 24644676 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d15ab4d5797425b8aee8a0804c3f6becf255127c --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b7913e7db269d3f076e96e0f6c0197c6b6e7e78ae3328dcbda683f9860fd61b +size 30046367 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e3cbc58abd1122c6a6d47e5ae8460190015f6ab8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb975b7b24bb7940f0a7574269b7aa047bcbded11c09e8f08accc28c67e85edd +size 35570444 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c528bf9d9d825a5e4617f210a68e9c3768a294e9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36a5b598a497f006f1437f0ec4b8c7ccd6cb1ffbf33e72a04e9c983fd4982a1e +size 7334204 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0f70dc55ed3cdcc452564acde3056de6159fb37a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df8b906965cba25049b162cac44501fa4fa07a768270634134e244273a50ee72 +size 26130956 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c41116bd44b736ea70798fdc86afc5d6d3394df4 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9c913819d4cf01fbe63e8a6644e1ce246b1e9171f68e4dfd1a9a5638fe2a211 +size 18684310 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97d33bd1e24e7f78a7cd6e0dafd31e92a2836cbb --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b114956855b37bd7c3612b1ce538904aee01698572f53ea0356605dafddef633 +size 24126919 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..613cba4d88d9775fca977b9e0b6157a110c181d5 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:188be788d7857377f944469692b37f6697aa682b20222597041ebe151c789959 +size 29394906 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79a8f53bf6e42dfe5edfc239d6c07458cc718dc8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16f61440b6dc7ba4fa37bcfb54cc775f28efe4e0dffc55af134f4278de76a712 +size 34785534 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fc06f2c2876bbf4b7174a432b462af9cb0bcc837 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96de4edf8b6ae5613b260f8f5fc1bbe2bbb8c73dfcf3eb5f5bdb82d266386cfd +size 8166962 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cf1a6d5c5d2b6be9a0a5d5b3527aaf9bd6802ba5 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:001bfcb9125099b7c60591a6cef65b900c7875a103e038dd4829ca23d1110492 +size 27568608 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcb18655fddf7373ccfb5ca31f98a0b5d371dc69 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca2ad316c00314af1d1948e7d42d077b420ca5966a170d637e02f84cb1bb9d1f +size 19742846 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6d78005ad85835c276e9274926b29a8df9c6c30c --- /dev/null +++ 
b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9064bef5d5d61e4782202653d423ba741a658ca785a3c6caeacfaf9bc1a30bb8 +size 25379384 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89fac0b86e0bf6e89577c1091e915faea38de962 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30a88a52fe993fda3153e273a889aa5e70469f9ae9733da2ba862344175e75fd +size 30821464 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2e8b2fe2874468c9d598d2a9f65564c7a47f55f0 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3ef31edc3e85a2aeb2c8f2c9cc5f1da4b027ae5ddfc375e0f94fad45c6b5aff +size 36464749 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23c491dfaa93da8f08fd246a7ee116553a07cdd5 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30e91cd76c604828b79a98d01ddd8285578e8e022032a6e8a9a699c4d2efdb7d +size 993319 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d222821fc8f523cd9c0951111be666e36d14a29a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c1ef4046926ad8ed15c9ce76d2d4be0cf72b40ed44e6cb238248ae2eceb5251 +size 1452509 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6cf4d64f292f401d1b933e0ad5ac9d35495c8d5f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d561e6d4db4a50f5a992362d85e9823d899b82218e1eddab8511075ac3ad51f +size 1911117 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fcac0bad23c1f3adaea5ce35c348a0b20eadcfa8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:233bf94c32aa64b3b1d097327caddf4892b7b364ddc4961cd7e1d08c317cd85e +size 2368518 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_4.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ff480ce3d2b2fa78154bd94e10e78a96cece2244 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5373205c0c1751a7be27d3598791540a09ec25fbe26c29e6cb273e8d7015a809 +size 2823025 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..522ff4476f2375df4e0d21dc2f08cec65c654fc8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c092c1a387b647ba87794bae83e0a8ae204258bb82856cd3c99181cda655e3f8 +size 3279370 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f126e0959deff5491d8422d6fcd1f743b13cac70 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0024c96950ba66a01fa91934d1fa9dc8c81bd84e39ef24f65ec5465fa9f20ead +size 1203131 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1ce878e0baca92bdffc5e1e95712a14e4725e7e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:701bf2d17666b683084b60e3615d0beb15979b4cea0af420cfaf229c978efdd9 +size 1755006 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..703fcdfe83a9f03f162be435d80f60bc32c2dc70 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1eb4617f740bc2e95a3177b949596d6d6ab08abcd06fa97d9e2cbb3fbe6a2627 +size 2304224 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f96da44c9796bc236fb56ee3e11ca4f7f4718fb9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f67a53a16d5653c42b983c177f5a9a508dcf5dbd7bf55b8d712002d52637e17 +size 2852304 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dbf98464895543c4d69d8db4aa826cbc471f4c88 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:640327fa5e653b7c4ac90e341af17fc1eb6783d24b7651140b33b67f00c48e11 +size 3397894 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..006a1dc409240f5002f622bc9f62c7567fec82f9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a283920d9bfe6d8d7f9ad31f315e7042cf2d152e7b0bf300a9c1ebb1a9723b1 +size 3945362 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b71c2269963e08200c4300ed59ee6d44042dfca7 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:183828a44c05652c77db9598054a39fdfac2cc49847a3325a2f92dc9a9785f7d +size 1007544 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45b2782ac4652dd6852bf2dc18fbd3a31ae6b747 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad51886490d5679fe5e10ee061bd78e0fc61af278be4818698798c3a75e0c91 +size 1478640 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..04c32b52484013467039c9ab2c9212aa43c2edaa --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3741ad93469821194505d8192d9f63c00f022200b2cf1e0ed92b4c03defc2bab +size 1949814 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..519402fe8548a91a43995b434ff5f910e472cd3b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9448ceea07fe8a0a5f74d3cad88fd1b9cb0fd1e24d9cc37ffd5bc66d97b28723 +size 2419244 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..73031b2443207bf12119f88a0c9eb7d47d1f8111 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20a5895a9ff562c9535c707f224320daf6ffa77028557d63a19ef16a2d07cdfc +size 2885733 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e4664ed34183db7417114434d636926bdd38b028 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_can-we-infer_5.jsonl @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c835e5f0b1fe9b0e15ba7a45e3f1ab63f0f8f145049517c044003529f4674905 +size 3354209 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d2ed6c2d70b8a9802a291c7f6a23b3eaa7ed31d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:734083a79996c631149d4dd8245a00cd486325bd2a084311d3f99c2fda44f2a1 +size 1160116 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b1ae935a7c590183486cddaa6120c7ffc8f2c153 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de1bd07a042b59381072bf2e2fdabf7e54ba3753f4bdaa248b8bb9f573115ca +size 1668613 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a36d5d50f9fd64484427e4e0cb36f71d6b8ec5d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2080f8f3610c4f36755f1a5f7acb881ece7cec4c0c5be93169bda157023d556 +size 2176445 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..132b8cd45812e548de90f1d463070f1003d52f02 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f88b2ddc63077fbb0a01e4e9473b8d89d4984b37b37ecf43359cc0bb9f85f28f +size 2683213 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..08d1c34bcc5a8424d8c857d2ba5f60a8d91c496f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5c63e7496c39bba4ad2d1645af4b647b5f3203db9d68995435d2cd7a08db3da +size 3187136 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..107f31887988fc749cdda923a4bce1fa581f4cf3 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db4420c2cfa292563336b79ab2a38a18b982e590e98b44268870dc2ceffc9d3d +size 
3693097 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..17790c71c216a8a3df39fc5d54786131c75c7a98 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e0d83df690aace4bafdc186126d02956fc0ef7d194a37bec14666456c45e9e0 +size 1026387 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f1b2f7514c28b3d0e60f7619687b86e35612ea5e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd21b3162cacfa57437728e6abdd185de9efb3bad975aa59294b76c811a384f +size 1503640 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..973885afc045837f82b8e2308fbd885cd84e2e0a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4487a45d7277b265fcb7e155e2371bb097cecd7b62ace0cc6ff9952b981633a6 +size 1980823 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..490578a339fd5e6fb0720d07f6c33cdfae5c8156 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6433e4bd3a3136c2a26c9303708179c63721698e5773234bcaaf152f65d8224a +size 2456368 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3a5b1e99a3fcc433939c7855fcb5591b98babb7 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec50ad68e1ae464197676d07ed0ce25f3bb2811b3cf155614fbaaf754b0bd547 +size 2928837 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..beb1d076278473c0bcaf038cc526c5f978f58095 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r1_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49c43c6b887f406896f7f9606d1a705dc0303c2747f7f9e4ff8986a04921e0c1 +size 3403325 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..439745aff9021af44b76c3ca9100dcaf819858c6 --- /dev/null +++ 
b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44271c9f1df4cacf9e97fe2c3d633b3e8aac9ceb3858c4d31a94a88a897f8804 +size 993722 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4b8e440c938f88db9ba09742e7505c993a220408 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1435672f5709117502a593ff9b3a9c9e35c193c4a6d3e7ae8024b359e708167a +size 1447613 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21394cb00ddf996a3fdd3dc77187154cd4f6afb1 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:901c5d06042c11978a3cd910f511e13bdaaf6f87392029f02afa1f51dcb46bd7 +size 1901060 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..93fea8a364841092012fa19900687b99832bbf3d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b12caf3f64e202afefbf05c73ab0dcca548512fe1705d05fa356d05d7caa99e +size 2351039 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a3b81db898de5ea5ca89ed9f12763f826aee0a6 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30951bf12ae64896426d9f8a9f39cd0cbcd8a1771e232efc3e13bd36aea8b82e +size 2800843 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..378151ef39344920affd277e985746378a79491b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c119a772002a94cb7dcbd3ce8f299131a2f3d187d2aa1cb33b22f565b66bc50c +size 3251464 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d6d87dbddb5472839ce7f190976d4c1698d012b5 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12b99e5b3b2604aa57e246503a6bc04330f859dffa7f0a5934737bfd80f33c80 +size 1203531 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..79b9effc40be6f7a0ec61081d035d801ea9e90ac --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6600d531936e7ef70031daa10256ab0157ee2e4db6c4008c3be3cb0877b09e23 +size 1750064 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..812e29e11e9a2be6ff6f6abdf92dcc664e6d94ac --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39baaca6ca57b7eccde7d47d8ca178ab2af80139eb122e84d39eb61f9263de1e +size 2294254 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..130646fe0fad56483bd8a7c9342e2d4bdf39f69a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfad1017c9d8ae4917b08c9534097fa0bb971034bec2f78c5396e055c6cbb6cd +size 2834870 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..10c0fed74778d2ad90ce3e73836337c7827256a1 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6eb7dc7bb43a2f1682a2594128c7e6f10e28fb90616521bd8ae5271afd5395f +size 3375834 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..51b97f19214aecaab5d2aeca42f8dd817f58495f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2066b27beff09f97af152cea8c361ced90fe7576edc15a6e7169d3ce63daa2b6 +size 3917605 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45a9b31021c7718c7b321786237283d6f0fc1d65 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae8c2c64c6ebea99b3cc853f4d977567129bbcd203ae1d5858143a4b2c0d0a5b +size 1008007 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68c84d98d6c01085b3a64c630542741e9ce37bf3 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:496f039a787df2357c566ae4c5d91957bc23ef327e30aa50c836a579b31a34f5 +size 1474064 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_2.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c15c643845324712b1a9e5395cd86f54d7324629 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c654bf2cf827ac4d746bb49a1100360ad76c591b57b2196956edc2537af5d08 +size 1940040 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..192768802a825319c9a83c1afb2af2ee172959b0 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f8c27290e4c58e4bb65d46e261c34e26d273203898bf3b2c742dec6c78853e2 +size 2401946 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..84e67d7369a10b8b183ef0214075efdafc073d26 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f05c6c7cf7d21868674c295e0be26a0b3795e92b471ff97800872e821067db19 +size 2863766 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45f3712ad2601a23b4c06021364848fb8815d1df --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef85a7969ec9c317603392a3c701824cf88548cfa36abc107060cac7ee039b7a +size 3326570 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e35640db84eda8f97b0f52700beecd71c00c12de --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29055a5c94c453ac7e692f579fe25495c4273ab58ec66144fdaa87f808492a23 +size 1160518 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b65c68fd7f4a3d62f4713181423473e51c5f9cef --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee59fdafe84291b596be48a5c4d2ea5f27a7079610a1d4385a74a20d78aa4aae +size 1663753 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97d5dd1fcff6dc1e3fea5bffce8f5dcfb5b00cf9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:99734ea00743e38a04b40a5a16d6f67d6be845048186c46da7edc285e1c7e557 +size 2166044 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6dd3d4292edcba87cb1ee5c18e35272f2f088bd7 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8b73d7a15aa47dc5ce0a95a06a1589350a56b635184a25292acdce0cf2f6f59 +size 2665179 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a59ba535b7f3870cafe76aa6252e8f063732f0d2 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd195a980d9447a396f9a00ef83a4d56ddd590dbcbb828d9e557183456d27cc0 +size 3164215 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..72516aeb5f8cc680040853524bc54f4495c056c6 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa73459fc3b1160eefc7a1cc297fe9839f4cdbea9d9d03106c363dca7a7d9a3a +size 3664398 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f94337d4bf38f02c7d42cc9a3ea733516e6bf8b9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25dc75205e8cc7af2460bea355484fa5b1653b2f874165933e17cd0aa65c1eaf +size 1026740 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8f35ee6550067ae44859052281da76b0f9fc053 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a0204268a3b2aaf8b93a5a32c9213187264e8125ce69590237f2d0bfb6f30b +size 1499064 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..002f585a6d90bb8f129dde67b3c6d632d195363e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee020111dadcc1404cea652bd98aa65fc9af6a618a530cb5c68c5bb2f2f2e23a +size 1971070 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_3.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e7be2ff85ce20b9711caf0abd2c04760dc9277af --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6edf64151db0a24ae7cda9bec778dcf4be50d1827d3c6e8b00bb8c4eb4cb1ea +size 2439081 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..76b5b4a8d9b19d5f2cb1fa7936cc9804b0c21d95 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8de90e40ef3b9c8aa5a2bd1ab8674f007b1933f213446bc124234ad7b67b037c +size 2906868 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ecd23e1973d87df5a1be72f53fd1961cea93016c --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r2_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ce3706a62b074aab1938052d3815c71debce0f415a3daecbbd8192fca63bf41 +size 3375694 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..38838a748be42c0d4dda0d2f37af1923b3b63305 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61d3b44d5d9fa3d4b3f2e1e7c1c84fdb0d9997cce08db751d6a105e0ccf0729e +size 1169465 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00eabce7054ac46373f812ffb6bb42cd02a79b69 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:071e8e8b4c1faf1e7d0c08df201b72376b8862e5671494699ba3730d03e63624 +size 1699171 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00990a8f6692ba3c796db88def35af90e433556e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d1a46a3eee8dde294cb1215d8db83b0adeed7f85311a590d841dcc8dcc450a1 +size 2218107 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4f58a50ae5d056e03608bafc30a9d4d5c092a927 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71e6410b28a374ce7d915900d8f7ba54da0ad46bb55899269f8394f31e09a576 +size 
2731571 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8f6bb3e419bcf7581d7fa84de74b44a2ad368a97 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cb29d4cc1b1f97a1f075f3b38b4a6b1c0b12dfd5c6fe020360c904519013f08 +size 3249119 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9df459451002589e612c07cf454ae924a725059 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab0a178ea7f6a0800b1c4feefb1cbc88160600a4952c67d152e88527ac267b79 +size 3777680 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec2bdb8a40803b92f06fdd1c8fb384c4e0f92cfa --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef41a0a18144cbb58969ce3f3704079b33b497abe0ca048c737c9161f0d622a0 +size 1421204 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fe3c0c77f0fd332fb2b9aebbc2f2949d79b9d340 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8436189e45f167b68f379e1b616cda810a1807bc1f8dbcf03908bfc6beea0535 +size 2061859 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..17b4ef7d17eef4fddb4f478ee757c828ff82cdaf --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bfcd8795eb8cfc873410a60c7358642b23b15c82121a4787c75f8e05ef4f2ad +size 2689563 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aecfdc167c9e3afea510c0b9b47584051d6a5378 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8616e010e4efcf38041e9280b7128dd03b19b6ebc7937e2939a1a2d84c8fb48f +size 3312053 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..35a6434ad2ec1993229deb58d619f356ae561afd --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:ee7b4616ab01f41935734ca27b5cf6c86ea153fc31b391c9bcde0c8f0147bcae +size 3938980 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..007dbef7d10f28dc6234064801c6d90c2b463798 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:698cf0f54f37198a9448135eaa7ac4a9f6ff3972b16917395a049faeb49133ce +size 4576913 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..27713931f59ff76961c77304779ee18c890ae1d6 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e1a9e8f3d1a66634237052090c4814bd7b51fe571d2bd07e5ee693df409d342 +size 1186760 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5907ebbbbacec7845f2d1c100c6e634df78d0a52 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0b45ff0d80f08376f1b54c7d7510caa8528ca77df6f3596b2137db2040e778 +size 1730743 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..67abfcd27ec6602b60d54076368bc30016ce190f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df4df52547a799386f2db2e134190e52820f6067b9a9a443748e3c91e41623ff +size 2264605 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba62829d4ee11a155f7a3f05197469a5dd73c5d9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5ddbb749e9bfb5a76e294f873a78cba0bc95760c0eb628362c3ceceefdd6c41 +size 2792369 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8476c7dd31818066e8911bb2df7d8aa515e139c --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03fbc7ff4e73b7337cd46e6290b72a18b88fbbb26ab311a08b18ff14410646f7 +size 3324371 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e9ddc4faa3d252876eabca7d6926c83e6e02efb5 --- /dev/null +++ 
b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17d9b93d42ae9181458c3c90962b63445274463e38edb35e7fbaaa26af1640b2 +size 3867389 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5049491007f8e87325d5ab67f0b506cea7bad71 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:187afa1b2eb5108dc6d35e64896df1ccce797c30897468ecda8fb14dbbf293cc +size 1369693 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..34f6ccbe423bc058897d60429f2e78facdcfa3fe --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:007612b8c5fb5a8c79b96714e527d849ca9045d7a35f9c269438a0247a0687d6 +size 1959075 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3af3f6c05d43c7d3353db5958c7bc21c591133a7 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:304bd1a6f1f965035a2aca5090f033814bfc51723693a01663ecdc1be1e10513 +size 2537064 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..70c8af483118b53e4fd3e96a462644185b6dc654 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be6d239fd41059ce5001e8dd28a1529e8d797df8bcd15b1fcd2d8f621bbe07b9 +size 3110035 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd65ce7b156ff7e0de69dc48ac57bd4fb7125b82 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e371fba01ea77f12022461e082a5cd06d1798ac30990957ad458417c8ead9f2 +size 3687280 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b9b87fcb62913003157b2965f81d59bc93db02b1 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:54cd08ed7e28bf654c045276628f0fb4e369b13426be55756380849da525835a +size 4275467 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..502cb3eb4addf723b71b0087126e971fc3879c6d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:568c5e524322aced21e1f5df0508a816d1e68cc6c17d263078369b88112636e8 +size 1209453 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..39933c2636ac39ed7b9cdb1c5a6051e44e115f74 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3f7d1d6d6548fc5354424fbd538b705ba9ce9f7bf9a237569f921608baf3021 +size 1760743 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a36014f97fb7574f33c4209e12ca752ed5c4cbd --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74ffa239c724f12ad0cde0ece525e41d57910a0be4d22c3fcaa2f1c265e4ec76 +size 2301831 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..54944a1c3b9110867b9c428bfef766c5cb2f4ea2 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea10ab5e7dbe2f8e8db74d24b12ecd9eeafb5f60cb3b7876cdb2f50f934e25d7 +size 2836886 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..28e31dc62aa53e51af0af25140b935e78bed3c7d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9a857e9372e2ab2069d7d554f9f7aa6edc4ab44a0c40368bd6eb237088a58ca +size 3376091 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..98ae15b76692b15b683308c7f790f2a616015310 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_anli_r3_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9649e676c13cc2adae10a1ac3798b5d881e7c301e1c21c99cb67714580e62e74 +size 3926311 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..46a7fbfcf1e959730bb0b8f8da43ef5a0fe6183c --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97aa9dd5451f8b79e6376306cccab58401ec4aca9d898f2e804e649fb2af1040 +size 1216777 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..522435e0da08b8de6a2ef7ccade0b2e505d27550 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:438e3c9313837b905c2c0890979d0f959b4f3107186d69a827a68c603ba31e45 +size 1670632 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed18ff9a374b3d21216a35b0d61e42823208584d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e66c2b523685aa963f19c87c5d161447b11bdd864eff119abba552834854c96 +size 2120084 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0fb306bbe9b5beacd49f7e6e42d6a9c6e7192b7b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d82e9b7f0b9aa290159fea5cab2bcec9227577a874945abcbfbccd0df18296c5 +size 2577615 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..92e22b3482d2d953dc8548b499f0b9e16a206c28 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a568ef6ff40883d3f88e587cd4b3c9e6d24d2825d38eaf745b6198ce85bfd61 +size 3027517 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a5d16a518b4e4c2d08e4d2a7c24c3c231ac7b08 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:338f5df60e3297aa5a1609ba7468b6bdc642824f3ec06a035af43005de913b07 +size 3479076 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5316b9ff946d272bdd932a4bd335507ad5ecb471 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:74c87f3f49d1710bacb69849d4a223faf98f591e09994a2cc52d475c3724c8ed +size 1458394 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..21dc8aa0f5179f646ccca9ce3a6cc7145dbc4b77 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:366df29d649b638a816244ddb754278ee3014fd33958868c2b401ac357111018 +size 1961068 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ff6c77c59489020107f45372c9dd55880f7ae0b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99f2e73251a66bfbea6c37673b5118442a86e48bec06ed84eced7462532344ab +size 2457966 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..68253011bcfa8d0784a2c634f546209e04b44676 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9025b4dd807e107e65459e98c89ef4a295ce89bbd67a56c02cadc95a891c7868 +size 2963772 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ef3d0d91b8e5588610ecd61f8aa688043893aa8a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a67fa149c4576589d9a56824afa18f7f5e1bf624baa2b7785e5e53e8560b9fae +size 3461631 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed65fb4f85c76da77ae13b77ec867fa2e2ac88d8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1987b2199642b9df439a2a72db773a91ef4eaf6fe9d48851bea043fd236014e +size 3960940 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cced3622f539477d8993c76812539cca0a29bd8a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:981a10e2855fd3e98aa6ec633d01dd9bf286497ddb0d9c9115e6403be73e9a9a +size 1506275 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_1.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..36fae06279980aad6965888b20767120efcc81e9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d334399a09f757ee6936b7b99bc83d11b901bfa9f722a2602b737269ec75ad9 +size 2032623 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..aff63d19f9c7146fa6938165a3798ebe670f2453 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2f535e6b02262f8e10af9ba825a34e45b3ee22adbe42659482485bde8827f54 +size 2553893 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0076d1979395ffc98c30d9a0ccf61b315d9067b7 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b044ebb481e5dd5e403b20769970165e507e3aeb63ca0d39bcaa468993867e03 +size 3084146 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a8201193115ddaba2d922273ae7ef0552a68839 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e91fc047dbc3835bcdc0a76ef22b41934cb2c84c2bbb4764c54512cc2aaf9789 +size 3606481 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d35cf49c04acc62a3ab33c1f138e991e7313fb9b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:825e454ee5622a7e77b131a4dcd5b1e62af9d06067d3dc9b89ad29436a1e2420 +size 4130512 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c0595e949ae65b82e70e098ed419e122965d7e54 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8da8ef33aedbbc797d43c79f84cd209d48e385e32e7b85ae05ccac3c55dcb23e +size 1202714 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..66ba86a7d866aa47ec99808568c5d4defaf1cca0 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:f4943fdeb1477fd6a06775ea2ccc793ebc2bb5af8517ffb034007e76532d90d0 +size 1638992 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6a511949939337675752f27b4181cb578937124f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66418d174a88758b7bedb31ab34c7c14071b9bb5889092ba5b3265d97ae8ee89 +size 2070864 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a1422b87d866b5274aa235421d27fa02e4c3a7f8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb4041fcb8e46b50744d58a45deb9018cdfabfd994509a368aa69441621352af +size 2510815 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b624542f974a18c4c8f60aab99278ef730a9393b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6a3ae0c02a71468e6dc2e290e381579c233fcc6d67e592045858e51d5304fab +size 2943137 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..00d3d72516946a565892035ae56adeca97838ae6 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc93f3710b36cdfc76463fa7d63058824fecf0dd88b388f7a60c9440ea72f21c +size 3377116 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7735f62d4b2c9f3b80e72938db9b547b24c1f74e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19fe9df700a5bad27b3f441aa18941071e0ff0a62243a6fd0277acd100d78c31 +size 1187423 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dcc83cbc115ea06e77bc3360c7dff109e2e76063 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:874b8f98fe07ad56f9e0971fd234702695dae21e0debe0bf06ccc4e51204eb63 +size 1557766 diff --git 
a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5a48e997be60f09b0f2e7bbf9623b56a79f00dd --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96d68dc8aacd977ef7fd7db64eb5c1e467b168d06fbb5597789f8c04e8c0bc6b +size 1922168 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bbfa6cc1210583f46f360dc7880f6f7cff997fe4 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f56968a309daf1f023d0955e6e2cdb7a69784747e40499645a3ded8b44d0eeb +size 2295529 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c002196360e433edd47dbcd72b1fa9a7969bed0f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b37c294ee4b80cfd2ef52e636c5afac91908dc6609622ad2dad3442a3d2ca2fa +size 2661028 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3f16f18dc9129e3a1f31bef4addefd3b2a0a0e89 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_challenge_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f05e5b99ecac710a2185b236d4208a286b6c4469b57c58de20649b851efc8499 +size 3027891 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5cee34b2bb2b7f2b60d07855cd1541ae05346f7 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8f0639fb8506707e7da5f51d8d59276c1efa8c8d91dd290af65d39fed4bbe4e +size 2351241 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..337123656f3d88dc70101b8d820c40b25ae2a88d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d864288c89d6357842885201b6ff55053bd716e84db00f35613b5492cc57278 +size 3174164 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ac3c56789583de9ced1b1a68781d17f25fb75e90 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_2.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:96c68b00a72f26511f0ff0a316018d5f64b91977c6337e84ccc1f04a8e4931f6 +size 4008946 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d7144af70472746dd67560def4f6586c94bbc153 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a48a6a22a4378a3365feab9fe47aa83113657dca702ac3c059bf3f46d2e5b482 +size 4831612 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..18d051c74f8d0424ded82079d14e6b92facd3931 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:449656acafe44f194bd0618b110cd97e4ac489658ed0a6d21886baa7ddd53703 +size 5662356 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..351d2d5007bb63aefe2d310e5a4fb8ca27691141 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d83218fda4f1744d1fe9d2aa4476f784cde31c25a6864081ee35e7bad32fb78 +size 6494729 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c39fa3a031637f75cff16195ba5fb6a7af564235 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58866af205beca9cad754b0d31eaaf2e70a6314dc32b2e8c2d876461186458d2 +size 2746051 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d36d9932a12e16484face099a83088db055cfe1a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb755674722f6bda3cb89a275da6a85c0d7e3092ce6402b5831d23dc617ac717 +size 3649359 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fcc0ea75ac9482e798d96d27303b15db1c7ef748 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92bdc95a121e8377deb2511707f81a5743cc788c12989dda3c1622a7ed195170 +size 4567167 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..a749a2e9ba9d8bba0c8daad1f02030b34a48e981 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:013db7bb89ff8893548775af8530bbba213457cab0de9b5cd13b4477c4b47278 +size 5470654 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a98162b35ed22be3faed457b773e5853b99f918 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13bec5b0957dc412c1bf0142d5ee4e49c08dfe53100a9e4d4c575fc3ec085b61 +size 6383239 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e1bdcc1fa5567df5efeec2800542b9babe0fb6fe --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39cb8469e4707143cefdecbbc9c3ec3805086f075c11bf520b6568b2b37cda17 +size 7295618 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3ad245461e5daae53975e25d4adccb9561d01e22 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e93b1c73d4dbcf348eef8a3f51db07c242114f6865cfdb240e8e474e9da7a636 +size 2843984 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ace379d110486ca7e2630e83c6c5def6bb41c078 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba0b99c6611e12e5d95a7c7cdac8b4f34b006a92a20d963060082c028ad284b8 +size 3794906 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2b1f62cc58b8c4a4518dd619d44b94a7c6220a1b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8e19d54fa148792513dafe1166348dd893602a2d84f3187deb94ac96179c0e8 +size 4762687 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9e3dc86db79472ecdc941537a4799bfb3ff762cb --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d237ccb7f578b730bcbd050c66b06977d3cd68ea8ec7f0df9370d05dd8f024a +size 5715969 diff --git 
a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90b43920d5672e6dda9843dbffd311b6d171a1f6 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:691ed125c2849b80bf8b2436965920a0aa2596e5352efae5808986a697d3ea9b +size 6677992 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..be684888877f1e96cf6028f5e4af07af5b0fc3d5 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf3a35cc4ac286d632ae1fefc48cb18c9eba165deacb2b5f63f4090fe1c03580 +size 7640462 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f114a250a13ee2a7c0acad3ae8ea8adfcf0b35b4 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64175e6e3cf0d12691b586541afb2d976e27e0b1be8ca99c7a1f47c8d70e422b +size 2322732 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b75b01ad63e5ee755358b84e232be5103e9fbd9d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f05d6c001cbb9bdaaf304adc97a581e9e2534a8946b7a7c09b3957bc13a8cbcc +size 3110021 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..090ff8dcdff1cafbd3ac7cfac190fa027a6cb653 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a9319a2bbab75b0e916a1f16127341ee856f42d970da21c08690b4175af9c67 +size 3909160 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7f7d602001c00b0ba5e99abab5009ad02a56fa2a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:655db3823b704a70d8d7125894593632b30c7a41002bcaaf2a4aa2dda03a7e9a +size 4696184 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..e8d22cd15f1e9e399d167ce84393cf43fd1b3a82 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b4f3d3d4fe22fdfdd403cfd358f5f051474fecb60daefec623f95fd92c590d9 +size 5491290 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2d34046939a42a5820144fbf9da6b9a3be2b3ca9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60c3cd51f589a9c1218869a02b67e4714a8d3e223225916cd14bd651cf11978b +size 6288023 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5d12e7c27a6e7a028e4e228dee486f1d8b70b422 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f10fffe453332cb3b0bfc9eca8efdd6da57e9913f430060571d1dd62c3cd72f +size 2196896 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9fdbf8d4fe8bcd8c280e5859bffdf9e7f082e4e4 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd727e0378e3849c1abada8bd9ef7622773bae4e58722ae974dc48ffae60f18b +size 2831982 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..818b4189292b666015a4153d49be07cae1dcb0fc --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:442d699839da8805ff02fee28dbfb14433b37e8eff0dcc76190fe56d48369837 +size 3481008 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a6e25cc30b82d13bf8792e9abe030eac7c0a14e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1e8a4ea002ee19ceb277633ac3bb4e559a9748b564d9c2dbe9c7769cdf45e5d +size 4116029 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e03e00f242f5ad614d23edbf11616ba79907f02 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:419bda7a27c2d46e47bd3b3ccaed4d23f4407d2d05867f56a363fedb64b8abc1 +size 4760162 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_5.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20f1b6ba9d4cd40a1c29839a52ade940e586b0fc --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_arc_easy_qa_options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe721f1cb2cef5675cb68ba1b6f26b77f769f7ebe4a247d55275c08083125cd7 +size 5403987 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b960683e3084b0e8115a84e13bc5cbf6d7fc636d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf9184737353f5d2dff55fa36bed719a95c7201e74517324e2398bd89ae6f4e5 +size 3642262 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f585bb17f83e1c4d4afac58aa5fd6ea9fe9eef05 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3abb23db1e57c23ee5d3c0f59f3e96a796158a0a8c60dbed66fb7d1fbe52935f +size 5657443 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba503fe8eb3e40345509a3a50f4fb8470b4b96a7 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:521bb9f6f33669992647c7bede60810cc9cd6a438e0402d905b188f0fd65262c +size 7694611 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..748de9bfcc82f6ed14ec4cd99dea7caa9edf5a8e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fe8df1a4037e0195dcb32bfc91c5c00c9fa85e471e97f32d30dca77499b0164 +size 9728764 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf51a54dec5275cf790239b92e07210ec8472e73 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecc3c2c3cb4b29c0008ad4865881c32d29a0ccfe3f01ad455955edce6de48f2e +size 11769421 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..11fefc063b1f26cb16563472607b20f871dfcaf2 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_GPT-3-Style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d19e231072afca29365caf6b9550e5873b7bd8d205208599a037215a567b551e +size 13790946 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_0.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..98fbb472fccd0b8fa1c3d3f87b8186a1b2200441 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af87518400cd06beed583f64b5338f7585324ee59b1961aac9b2524ac6ededd3 +size 3984918 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a4907fa0f6fad03f585a5184f73500f6d6354a72 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e632237bb0a865bc37e5603264f03718b2e31e6cdd9270a4cecbaa6bf71d9c4e +size 6168138 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f86502312137203f113348cc94d73a5f708ba9bb --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9dba1ab7760d7868c506ee6ea661fd62fff3703e4f1ef71da48e00b5c00569b +size 8372456 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..63654edb4d94970ea1a16743423722cdd5cc792a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56dec3e249556ccb060f54b3c2ebaf5b9e108e949ac396c6c213042edcc71467 +size 10574089 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5a02ac585aa8a79322f265fe7b74be131018770 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cdaaa2b70841a66bba848d9d03215228a61746a3abde02aaed6081373cabbe5 +size 12782159 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97257b18b4344015c3c55d0d70a4b30a94b6464d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_after_reading_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:090dbf2a21c5b7f70922702eec1315739531a4fa3868346aefb6fbda6625ba18 +size 14971113 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d40d207590fe585e21fa97aee4bb270de38833f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60753d451d8b4c4fbee9fecf9a14420c3d6d35449c9c791d1b72de95f2ca152c +size 4041656 diff --git 
a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b070194ee4e7ee95cf0beea26143887635ac7daf --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1ee1872b73b29dbdf92dd33aba7f230ed1458f0bbdeb72c730caf05939471be +size 6261122 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bdbbcf94558e2fa38031af1835427d3523d85439 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61d947b97baf757bee23b39f3861162eed80bc8c69d137ee28258ca06f4fe3a8 +size 8501646 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..360f9a996305fc875e41de5b625454469f22d570 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10bc0e4218aa30893cf64eb0225025675eaed9b103b0da3c67110b4fe8e1af1f +size 10739169 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0384190ee6f99e3926e0b04684cc3c042e9d5847 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d064dcc5126726ac078d5385740f8d0014903e51028b8d39c298125c7595af61 +size 12983161 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..27b5f3b780d3c155bad5f29a72baedcb992ed280 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_exercise_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4cdfc3a5a114f2a0eb16fb719255da0eef316895f9f8d4d7cb3f07c820a76d5 +size 15208049 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4c9d701d00f908e9c1f4edc2d2f573a0fc358805 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb1cc7f2b4b41868bbc1f18faa113229313e32caa9c4b5a9ab591d8458bc40a1 +size 3665661 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..969c4686a62a82fe03ad0fd776534c0f427b0408 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e0f3e26c76dec5996d96874a019c80999742a294b7e47b3631a0222ddfc8010 +size 5688132 diff --git 
a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2217dd1097954f68393c618a81ab2231045bfa41 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb381f3ee43e8a3a5849a7f55e84bf816b1ce5f0513c0b5b82ac40e6cb56e381 +size 7733370 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3bc79390f227f99d9327c0cbbfe7b48540f630c4 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f7e64896b961690aa0f3ad50d077dcd2f6e13ee9b7e49e6a6260011b6004dac +size 9775929 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a3807bb7bec11cafb6f1faed24ccfff3d6676db0 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c212fb19d5c576ce0045ad4c52e9c568609cc25d3bc1601cba101e7784b42071 +size 11824935 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7f6e24fc7d7e549e1866668d26353ef8b96deb6e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_valid_binary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f6cdd060df2ae54c14205060e3589fbd23df412b93cdbd85fa8ce3cf80183cb +size 13854862 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6b9fbb49031c291c48ac120271b7cc4e01d9202b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c7ed13564d9300d9b26f909a1860680cbab72d3b9b0a51c9d6c254785478be +size 3862346 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..53460feaa5a91b8b00417ede48d79f555b9aa99d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8760731e451c8678bbd507eec4a22fe63126dfbfeaa9bde1906323373b6b22f9 +size 5990493 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c6780c2cc18bd9bf81b2c8c3d84dcd4d366c15db --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3c8bdfb10319f9292bdcf8662b19a850cac588bce1606796170418072c2e4dcf +size 8142138 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d46df376a09f982da8f40185041aeee7bc776f9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48bab6a5b84c568973267ac42cdc69faf4cc637e4ad84f1a5f0132a508a3838a +size 10290362 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a6c21e69385d968e5fde9d04667301257fd99540 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0fd21c36c6b11e8a5f9756b155a873b9758b4a2ba0cac4561e65da9902c5484 +size 12445110 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..18fb602fa4bbb9c4aa7b0a59726dc0652cd4fb89 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_boolq_yes_no_question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e51d948e979a9a9500f88b14d411c7d8d02726d7caadde0bccb87a1145cb0f45 +size 14580679 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1f2c1e096182271c4d13236b313dfd780201a644 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7083ca49da78d29f02f1ef36ed4ed3cc2de1dfc38a5efb9e5b2872dda4d95199 +size 55155 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8fb51b245b84aad009a2c603584cae0675759bce --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaad53827ebb98d45692decd7c38e9986d232b06dbda2f58e84ebe89e80f2188 +size 77974 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8244232790953a4e76e116e017e374faa498302 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e509ce261402cf63399038ca7d93b73dbbbc5cece1551051026bd514f9bcc3ff +size 99600 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..69a1333c8a0d87b1d0233b8c44b32d887626f740 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c5916980dcd16b0f908af7ad094126aa97b17de7c53b1142bb3d1235e3c97bb8 +size 120769 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..98e65332fab8f742de320a3b7e52fffe6d4f1517 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0a5a055c4182d9c8d256b7bb064819cb5cc447d31dcda76198787b8a4fb16db +size 142894 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..145e9cd99a346e7c1f9c5c193b607b7ae7ef98b5 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f41e379f47fbfc2b62467066bc5a6cd75df29491a1fb08b97426035634fb694b +size 163730 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..baa4f1bc84e56c23659202bb95395d0591230ff3 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b36e53db6ccdfb1f6b32a6464287df44b4022a292489674d6af7c068e1a80ac +size 66233 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05e727f552de7c8f5595e084f9c4ab9b04849c03 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc9c0226b8c01919ce546dc31266c1d747a99ddf5a4fc3b08918c855832e043c +size 94141 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f703469f14a76b44a188bf300e58f2d8d5470f30 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a27452e65514f2432530e5f9bdb7a28b160dc1d47401f4ca4c7f784df93fa221 +size 120842 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dbb365fbc1015ee0b4f42f5da7af5365eb4f3a13 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c97a11862089cf09eb86a6cbdbb92d82b22fea009fb8bfc9532391230c711707 +size 147099 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cca6d733559c749119802638474acfdbbb91f1cf --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e20520c5658231a969790933c6cf9abc1420348ad3d63d31e27506010e228f02 +size 174305 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6b8b2c7849451555f7de47173b314afd0873016d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dcfab7fa2269202ad09b6d020c0528a3e9a51a2481937ccd55119b2768b4e42 +size 200219 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..427de5086a98ef67758d8c7feed41f67ddb50e22 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a4a92e3a107d269b2dbd9f941d4f53eea6c5c0040fcfc51cc233728d1efd329 +size 56290 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71d915e82080201bc20cc8f3431ee44bcad403e6 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2968da46200dbb73776d530f2887bb97caee5c1a661def8839cf56ace0044d +size 79780 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..262fbd8636d81abc1c8223e3a6cbe7cd355009f4 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:451688bb8e85b7e4826bd65fbbb8a34244bd1d817882b672833ead5ec27420f5 +size 102076 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8ab3e422e2bc1f7e69a0cd7f9b3ef53d616a43f3 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d2648c49bbef6cc177a05fb5a1ac46360c61bd46f3863c884b3b09ef057fd86 +size 123885 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9a42aaa39e3c42a722c4a2e4c1c4c996e707b627 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06f872288d7d261f508412f09af4093c12a671aef2c998c86b0e376218dea0dc +size 146658 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ca2cc8684d258cd73d3c8c19389f4ade8e132348 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_can-we-infer_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e49acbedec101cd0a3235609562f6299d9dd4110f7416a3b46f6543e1cb13ac8 +size 168142 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6c5f26316370ddd6a8b6e399d7b026f422a74114 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b709d264f693aa8f53d32977608b97b941400bb054f2894b9f021844bf964a7 +size 63899 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..096f4363561ac797562fb5783aae5d3c88df1cc9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d567a3554dd0823a11fe1ff69a87987aae4927e5d582f9a427f7c7b56060b83f +size 89640 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a0164ca7a6c453e86526a9094d80518914478239 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18e9a3c859749fe0c345aa0764bd2676e1b956cd4487d9cf76e5e6b62e03d916 +size 114105 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b6ed83174fce5a7de3a3c0824e08d87ebcf93b44 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7506fbebd8695be7db5cd70cbd74f324384cb8e4bf6389f400831c1424b5b464 +size 138138 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1d3d8b52e66980f3c730c508093881c11b5797cd --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f70e9c4bb3dae2024c3d11f23388d48d39e559e9e79e942c3425aaf0aa87a1 +size 163140 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6e52a4b06d9a1fbdc594dcec82972b5ddf5153fa --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_guaranteed-possible-impossible_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a2b4430e19f84397ddcc7a05b669caa72ffd29150d5e3d67302b8ee4e985275 +size 186810 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_0.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..157eae5371957df529d156b89d963fa30aee13c5 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f87fe18d8693e9ce092607d0c9916d503f81889ebf66d1e63f350ea8f77939be +size 57287 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..191930176fe74cb5fbc2742e8106e1eb995f30c2 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fcf95aef3bf1fe5be83db59d096ca2b25b881709786632595eaf5c43d473e05 +size 81124 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..12226747db109438e69c78d0023fe3d3145e8077 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59c5ac514ecdf621e6b635ecd325ba2bbc41e56cd8924695abd4f34e4a13b09e +size 103751 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4b23af74f6564995eade119b21eaf6a8197c225b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87e700e9c2d49b3e3228bf5cec552babb28c1bd3914349f189095776784e5543 +size 125903 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ce260f956a6bc0a76c3b4cda1f0ea67e84ea4e7b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:351c9d2f2d2a185a1ad19bd6d680e6e67ec3c11d885ebfd26551113ea21657c1 +size 149014 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..17747ad6d7595deb33772e3f3ca3c7572c4b5bbf --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_cb_justified-in-saying_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27ba7a31138a433928057a64d7cbbea4de992e49a12c4bec2e1b6c4de03530c8 +size 170834 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8b25de71721b68ce7c7772a3543be6dc7efbcd2d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f918e74c7abe572931afb75bd01f21500b7ba11e1d56bdc5db77e97645d62ad +size 92170 diff --git 
a/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b06b60b6b1fdabe5fea6a05f8cbd42e5cce1568b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f5f18f24e581d29a1e9260ad715605d6438ddb451be816d83bea6899a6c6af1 +size 111554 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d8344b061b61caab8eddb5b5223a7082bca1ac71 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:085ad976b1a657a73acb99cfa00afa79f0da2c6ee750004dd2a7712140d3c550 +size 132074 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c075d855efbc9b610df656f873de7d83893de10d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12eb0a9d931e97e2e2f263ca82ff79505d1f9ed4dbc14b33bf2ee7a50507e972 +size 152234 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..680dc57846cc421e3eda290c2d2be1224b036b80 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7065eb96c029bf7f319ec776491bcd06d4ec6e53273861e7c67b92a667b8b0ef +size 172049 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..86cf747d63e00ea967d5bee7976642b2eef2fc84 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_best_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a5c2f2fab40257279a816b987daffda5b13b3150ab38dd5ded90c5e072b4fb3 +size 191974 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..879fc329ad219f9da2196922346185a5ee3795c5 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a031654b7b56eab692df5e528c4646f4e2cad35e285bd2809213d8ed2fd49ba +size 87860 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2d9066dd5a303e41a8f6e1ab8149d9d94a2bdbed --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c1504037de2a156a2444c93f7eea6fce8a78765ef7420fd402f49417d10fd10 +size 105071 diff --git 
a/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..70490dc81f96c4bab720a88f9c4c5dda5b17dd2a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94efe524a8cbad3bef12fa6f50672e951e8f348694c405c98a5c1b6f5e7c97d2 +size 123427 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d22ba7aed127d6691f515d27ab0bc5415ad02e7a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:319743200bb7b9772bac9e1a7c7b1e6b5db77d08a28c2e3cac212c9680e464af +size 141425 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b85b6cdd28916fb1ea7400149af97e421ca999de --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d67494f88527f7d49b455e41b29e2b9c4b8812d3ae68af1c0be19a0787f7761 +size 159091 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..47e209c1a17f3fedba7da3a8a4e55a10d9637a79 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_cause_effect_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce8219c6319187c10c1e7e1478c55f3718a03603e9c244a83951f956b16d67a2 +size 176913 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0277a5f961a1216cfd786b92c5bf435f6a378fbf --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb5f72919974e84c923bde43c4acee8d7817a22b4e3a3d152812414bc2326160 +size 85203 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ba6d217a3f60d7f6b2d0619925581b3c9e9c9d91 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e930456d639bfb115249c561e684a5d0a7056fd5c01ba72b91ae981eefbfa0cb +size 101240 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8ff22de91a8938731e4f61f68a468d93068ae02e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7919cb07b009779e8757f7407c9b26afd8808b6c6b17e72ce2e88cabdcedcf7e +size 118482 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_3.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..edd4c5e31e0f89954058211186effb25f88610c9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:299501bc3e4a105ea7e0bd7bc7b91bbd688ac0a533237880b55bdcf2d4abc552 +size 135382 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eb18a367afbcc0517892fcece4969bf591e11cb8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b29834fc089701dd4c33cd8da1e2f6791b07ec6369a57cb8e4dad81ece16720 +size 151926 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..51fc6241eabc49db3d38cdca3fba6f14080bea30 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_choose_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44af90a4a312898a034a71b762140ad3b4c143043b41ffb46bacf0c421b0f0ac +size 168730 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7e1bc63a3132e8a470a9ed2b9c38b3b799181f19 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1efbb03d3fe46db704a409816f10b9be20d08a66550a58478953077ceea0c49 +size 96766 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4546c851d9663954a900a0eec02ed21ac8118c4d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e22189c4654cd4095e282905d45a6f892339128362886aa69e22162df93d4a7 +size 118255 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b08f8c62bca85fa06e1426c9cdca590135b855fb --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5926bae5bac16a96d3607b7b6aa816c4676dfcbdbf96363e9298c9aa0a1994c +size 140927 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b7b7dea68f5f295c7b533bf212b28448e53cc682 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ca14a4ae8f0f4e95be484dd97aa0ab7a1610b74a31553ca051e3c2a2a744e03 +size 163213 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_4.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71c2f6bc93c8b5f076187caa81c6fe0dc29d0a65 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccb6379e46af525bf228fa06c8f9fb9b43e3a22c2166375067aa8b87c6bfdd55 +size 185172 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ab05a37b38ec9a562cecc9a46e1e2f63502f5e51 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69795349ebfd2d9f2459ff8d8876a056eb1eccf16a2a034430c25f0c42711bf1 +size 207309 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90ea2bda27088130666fbbee91925b648af7942f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14f44a08abe77e7a70086e63a1aba7927c3ec369d8f465c618a8c2a8dda0b31b +size 95780 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..094d85876a61954c6d6afa2ab22e734296d5cdcd --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f482c3b4e0579ab72ae9c04667b7d8068aab6fcba51a2470730b41287e31c24 +size 115659 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..69ea4abb03473138ef768ae89cef7e3b5498af74 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c954b7a4c5fe53a3684c1fa72341690f83e689317f0dd99bb413e198f5cb0c9 +size 136674 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..73fac5f8bc3bcf48e08405895d266a31f379f200 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:442ebb428a0091a41d18f5f0b4e1058ba4b6fa8180a68126ce1d2c9efe99ad9f +size 157394 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4d4a220709419bf5c396ee5c895f3e9fe1df69b7 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:208b07f420b66782bd56250ce19873334553e6a0a190758f6babd68e473ad2e7 +size 177730 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..240919c5b429007ce0c11d795df4763389bb5f32 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_copa_plausible_alternatives_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aabd38f9a5c0e7b6194509c0f591102a780832c585ef2ad5964752b714322816 +size 198348 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..61d9516dc316f3b53be809220fcf1aa3a25596d8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e61d87985923ac4e16498b15d4621c0cf1eaa605bf30dcb47865db0e25a9601 +size 3308807 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..54e4a4dab43d2437c46b424c4400741187791203 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d013341909c942e39bceabc353859e8e48bb5260842d2d0b4da6bed5a71b733 +size 3933352 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6ca2b7bcd5390c9863bc80b0c7388505bd594bf9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc09787c0aa542b140abbbb708dbdef9edd5141699e649fd9752feef9186d814 +size 4891909 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2ff3c9b2b7343f57a763233e65255fc4d65f099a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76565f867508ce457eb42b9bda4a43f3519aa8dc40e7cc71384ca0f43d6cb57d +size 5835142 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a2f5bb9a1e89a5fc863ef84dc2baeaa8e7c3993d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_coherent_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac7058e79f33a8784a8ecf4f5bc58a265f265baf59e7e8d10ad262c156cf745a +size 6772046 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..385c009294d11f19f32560a6e05da402816d55ea --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fde63c7955a9fdda48cd71a6cb4b81ebb64bcb5329d6d11494c0f8651eb3a79e +size 3418675 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..993c70975e4e73191795c1e6b9d5d625e41cf6a3 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68b82b94580c2902259f4dbbed0f5bb032c91a06f17de77481aa946fa032a8ec +size 3863372 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..caedc74b7374d5027c15aceeb31a2d5a6d13e231 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:907da56a8160dd1922270288b5c4bad7bd615948c238687828f6a1092a5808fd +size 4792633 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7a5c1cee5a7a6e066d06b20f92ea41aa008d851f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:056f121766991592b8746d05d0abf05ee0ded46f66c700a43069657741346d05 +size 5708141 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79a60a52b6d2efa4916025d8828ee4275df52c70 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:309054368b5d4ae5cc55530adff86b8e44c3022f19dcc43b40e1a4f9caa75540 +size 6617573 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0eae5a3a85ac29f2c21142f83a19cd3f1fe2c4ee --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81d730e7a07c324f47ce6db65dda2d893b04f40179ff8645874849c862336f9a +size 3648305 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b41294d9266aaead494261bd5fdcf6cec9ff0ce --- /dev/null +++ 
b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df13f6c83cc81875049eae7fb00f40c420625b2eb29ba803ff13f9d7d5af83b1 +size 4949539 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4775c6411f2b56e954338533bd35b9f8c8ae0c5d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2954b1fbf99f218381ceb4053d5a8c370306b269bacbf53778acf4726d41e7d +size 5986326 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..000170d8eeca3b3a343986f6ef357f43ba654716 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e4c43664eadde44e573cfe81014a804d2bb74d0626ca120e6e99e11c35eba51 +size 7032480 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..80af528120be3a90b5457ecd5b63c77bee61eae5 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:476ca01927a3342fa57756263b83143d5c257397ef0f9265f8be939bedefc2f0 +size 8085486 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c061848e976c94361d6b3d011ce58f3e7fd3a96 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab7ad2fd4b66065db20eb44ed01f1db245bf4880ccddc22caec33e1bc8198db8 +size 3986049 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14d7004960ddf9c34b3119f80c5c29822839846c --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b292f0fa7fd8fbe70711ddf34fa6f54471aaa732fa658b8443956f3bd5995844 +size 4993570 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..024b1bb48d30460b98874067cc17425c68aea2fb --- /dev/null +++ 
b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:962f45a1168ec8c72f4526d19dc4d35d05af5f841a98f6152b6b6b60c79f3db4 +size 6086704 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2e36b2feb8449c736eff9b89be441d59ecfed48e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38a3e2122075fe7468613a8f6811d19d220e8d1684da9b57dd6a35e9d556ae45 +size 7177098 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23f220dab1a91b28a1705dade6de372b8368ee07 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:053e7dc588df7186d73b323d0f535099ccc6321fbc7f8756124458a4ada61ed8 +size 8260563 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eb7fe7ea1bd9016e701e27c822f8149ce11e7f19 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3627a6c999e40976f4e3f628c5b494d34c8657b088b01ce67d96e905a407f74d +size 3086155 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9fe3889f1006ff48518c74653032839d9bf0cda2 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10995a0ed5747067cec9857092847fb4b707a92577f9469d80139d2b0a222630 +size 3451667 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a13ac604bf9982117886463721cd63ca70f744e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42d167c51f168793c3d15831e5e619daba104bbc6c2a139ebbf97a78a377300d +size 4252536 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..481e616ff9257ac2b98dc0763c766a5a55f95271 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9559b86486606782883c54c8a3b4295331e054ed75169f0a0951041024a4bd81 +size 5044364 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_4.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ec08b7be5cdd07544abf5da36e8ff7998c82ac95 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_e2e_nlg_cleaned_text_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eb89400b768da3ae48805901e60429075529cc74d254a165bc7ed1a665f4749 +size 5832723 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2e09e083c192940049e6c06b1f7733a4ec7421a3 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:977f1918f0762a8604ffc137928f1eb92abe4913192b5829a1455327409a958a +size 2891047 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c126588a3c0c4f83b41c5937954e09230b6bec77 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b19deda55567cbbab0bf0f22563be1b69586111bfc584a3eaf4dfa2d422b52cb +size 5090575 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..10f21f3196490df97d46337865cf5f2bd77ae452 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8586aefdb93adb04133aaaf2c6fa3c3441fd89dcadfafad9a6db8faa6f8b533e +size 7318314 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d6ff5a46f1b5bc595040371c1df1ef39aafcde8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4794189e9bb7a4074ec092c8a2bc23d2fd7806fa1bb637a4771affff18b6d60 +size 9604919 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0262bce37f263f13ac72af888b1624ce203b9142 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729a330c998331336b94cfc58903a7c28852c28e52b2ee98d07275083d3c4335 +size 11784134 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..30051cff36153ec1774d4a27875c7e919c36addb --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b5099e273a176b30e252cf465c0b09aea234a62ba383e5c84ab354aa5235a4f +size 14076790 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eb09088de994aa68680b8b8d389cf00473aaa80f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd99a80bd87c051bc1c87224394c1ef77d21d801acb9890b972bc6bee7a4e9cf +size 2777731 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3da4db0ca293a3ea7ea469aada2552afdacd8340 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61e964846ddd17015b698bf9fe42f44e22c8d46e9b248da5742c83985a9f099c +size 4956472 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad7345e967c563b7191855cd1648cabf0d93b3f5 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b45a8e95a016a6a89d48d4b77106aa0cb5a60cd4c1c6e661947b8bf049d9874 +size 7114174 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..558b1f4e5c9fd9adfe36a150373b110055863586 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e447506f171a1a427bc6663b6c47ff40bb25a0f83a022908332af26b1d02084 +size 9371060 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9b73f98b1b6b9497e1429acf88d052154fc54f48 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3df3f3b8d8139768d110a5232d0e1cf005d264cb1e8d8aed98fd68b13d1a5fe +size 11533527 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f609471e08306cef4b34840084328f54bf63a0d8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_DOC_tldr_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34acfbd719fc1e3e5ab686202287ffa66d0150d43f7081d0d728001b98075cb6 +size 13793401 diff --git 
a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3627d4509277f23540ae091124ef6ccdc2cadaa1 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9225c64dddd02387b96a53927b16eafc826ba15fa1feebfafaaad95fc32935e +size 2792207 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8d71c6d25f5581325fb65d05325cf50c11a4d02 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd877ec884eefc18982860544e957417de3f7dd9f698f1e7aaa04dcc32ea931 +size 4985035 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e64374938e8d617a4277adac9a3e3326eda55215 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b7a047d833d4e0e79662215434103afd53baec6f6936383f51487952c5a78ee +size 7191499 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b4e19ec518aa4d4573c30c901ea93a490070755a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f27d9e4a6c4d9a782ad6eb79d2216389bdb2c7960809b88d870921d1877325e6 +size 9458314 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f2a746f2aecd58877986d2173631404ad9a69802 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6046c747fbe2cb2e2842010756f5f747dcafa1fd4a73b5be3d6f7dcd139d7a96 +size 11628262 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eacd3b2e841848b23a6d55478dd0ee191e1d1e33 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35859ed75e8bb8038dc73d9f22e1430032c8dd49d7bc982eb10e37626e748328 +size 13897472 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1f045527d2e315027b3999fe87f3188275569f79 --- /dev/null +++ 
b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91d9e85e26de0e3db2e3723799ef7e2a47f1e89e5c270a00b7ab86d3e95ae9b5 +size 2827917 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4047b2bf148c57444100be6b5a21c49076df0c3a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6730bd25df70eec7fa624824c967b21a3b900f3b9365ee4393e78fe7af0ffb7 +size 4993718 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..143ac477091d110f6746575c26b4e24cd4d55ed2 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d321f7bb5dcd2139f501f73e14e2e21c026eef47d12d9e8536c3f93fedf34e9 +size 7194945 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ccf74442f874caa0b3c6f838ec7c5a08f9a55dfc --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f453b0345c1bd79fdaa8dea8e3a1d7d7c9be50aa4dea5c76f660fd9ba1c604de +size 9459226 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..270195c6e104f94000cafc5244627eb29bbd4d1d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00a80abe729f40ecb4bfad39a689a88498fc4939930f005c387f79d79349b3a +size 11617943 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..70708974a22685044c734fd7e6f8a27b6dc62fba --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_DOC_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3023870b78e48167d47bca8e600a312c0c6970536c43a0069ee6d091a5f6bbb4 +size 13882465 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..41e3043009166cd0127135d53e5d3909185e8d9e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08e225cada19de03c607ee7d1e9e1e836aba94be3c9581d989d25eeabf3b61b3 +size 2875641 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d57f033830339c62c371f37f8c8f29c46462fee0 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83a6145caf8c69b78245f5294802ba0090cfee55f7e71cc2137e2af61b72cc17 +size 5039924 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a982c652617df6f26759c9da3fece8b1ae70670 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f6cf72a46c48552243ac754afea8b4522fa4be58ad9363f2557de18b762f563 +size 7288614 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..84a6c4cad52a1ea7a12d367c722bcbc9808c717c --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cea28cf44eaa97155da2ca8d1d7581572818503ef9ba3c143f00136b34447f9c +size 9567917 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e01a55d50245fe315bcd9ad2f89900bee21130e8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e512895ef63598c2688b8470c03c635d29ec72187be3b0c9a4e81dd8c043f034 +size 11736756 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bfae56479dce76af0382369249af8ac490a729c7 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:413b9ff2c8dcd2f62851a68431519c3cf49bfa3bbb50e468a5a4d04118933402 +size 14018667 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9a6361b4cfe835d8d72d546ea4a13ad810b34a2f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c664ad6c0e0e44b7363d4a6f7a04f2a08619c1440122c9745f9a1c2302dac9a1 +size 7126374 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07744e7caa3e2b4bcc042680bea1e3a267aab1f5 --- /dev/null +++ 
b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93674b763788e285c09042fb1ab7c7f69ba232261040b75f412201ffb8ef32ec +size 2613849 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8e07b7539e71c97b8bc015e75183d42a9affc0b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc81cedf00bbdf63f96586f88de41516bf277d5e2a1770b46dd59bab128e6f20 +size 3282696 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b2c41072b54fbb65030dac8528b49fe28b399c41 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdbdbe76b88cc66dfe67d35b3d548c884c83d2b906feed41c0e8f7e049d908e6 +size 3958298 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..75f111b065e180d7aea4cc08bef7648484291caa --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e463e441df8123efebf9fea8d6b432432054f1d9cc09a5be29e364a2544c4c79 +size 4650534 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1ba94991a2e76eccb38536cc196363436fa25015 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_Correct-the-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87294bb67f3b03c4a58b7161aeee602b69318eb4e9712a5e801a8fc3691429dc +size 5332953 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a44655170173873acf86a1ed1dcbda9b24f4326 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a4718ad6dd6cede1a941fb772f937474d98a7be1a16d524a1bcc968e5889368 +size 2115935 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3c482c3df9decd346f5fc00abccee2b98a02e3ee --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97bf18b2f511bd1d461a4f115da409b0aa8d1d71c9f6a47fc844359cf871a4c1 +size 2900909 diff --git 
a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d2c462264762c7a155c1e4dffdebc45e9c5cdae3 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce8a037dcf76f2aa5ca31a181cdc2fdcfc54792ff784d99ef1d4e15df5d43b45 +size 3683911 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7a41795655e5e440fa3979c1286d19eb6afb3135 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f788691e9207c66c8427e70f3ad9f85f96a89ec3d040bcd581f3481e6857ffd1 +size 4458372 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..333e41be9b53682b6deb12f1e4094c1557b542d5 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d4a3235d42f5c1912f8f286edf788df1d7e587c37088f8f6ec7282bdfc207d0 +size 5254516 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e47e2c8e77cd24493514dbc8a47304b1855b71bd --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_choose-the-most-appropriate-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07abd9fda417b07b083043550690887b394277cfb5eb4337abcc75118f8b61c2 +size 6040677 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05676e7d44abfc77264e904e8a42bd14edd16742 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e23156c62072e299a59b9d71762a7543976313cb8b7337f050821fe1e43c4416 +size 6012608 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..30f7e1f745e84dd3260d6580e8a40279a7019458 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f36655f017641aa0185fcf86de9434ca2108bc3248fb7bbe1da3db9077669d2f +size 4665106 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..240c2b7e7759119976f3a73269879dd760591d76 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5a86f9dfc1b8eb6dd8274a4b78c9c1cc961065f78d3b7e41a41569613be6e3c +size 1853670 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8672c3de89bbd8064440853cec712f7c806e2b2e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2220a761c1a7ea40e9b00f5af42714eef3e26aac994959a13c1e8a62a5fd355 +size 2108248 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..209c77a7d16aafa65f141b0036416647f7fde7fa --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd135c05fc7cfea3ec0c3a9742930de436b500ee798b849e304028a5397cd6d +size 2384640 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c76565f1f5769827ea1c2c21630314adb9daaf7c --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_no-prompt-needed_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a49419d142a9bb9a3503d45444441a24a7a62eacca1089d194713399352a1ab +size 2680646 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..db38eeae18cb274507f443e92f90086d85bf6131 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fe6d1ec7b37bae5b3662ab78623e555447052a034a47a8948c3965986ff33ce +size 1864129 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8eea26f1acb46fd6dae44beec009123b9c51eabb --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c9ceed2a77c17d12bcae79899869a1c3d0495a63d418f0d2f759013ae408840 +size 2557203 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7a7f07c8e749056e8c2fc8b67706a00dab88910a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5da340eed18eeac9066004ce0419acacef1812d41149c4c75f1c2717f2fb8221 +size 3248305 diff --git 
a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97141e967dd9b16f83639633c24d163ecb322b52 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd51c4f1f466c0e5968690b168b17c44242d10fd95d3ccf20bae9d69b38ad5a2 +size 3930866 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5ca252b8d96abaea07f6ad5932d8e20a59095421 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38552279cd3d013f449dfd42120048c0fbb7afb5a49d89b6333f5c11be23a23e +size 4635110 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b4284e32f22925af52ddd5be07cad95f63d2606f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_pick_correct_choice_index_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e179947601fd7bcce7a82c761a46893904a0629c21c17ff947a29ac43574370 +size 5329371 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90cdae38b8a42279edcca2919103d6df39accab3 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dab5854265ca1b092bc5b1be875f1920dbd8f8ac485b0e6016491ab717c7f5aa +size 2257280 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a4b6f5cf1839cd433d0f5e4bb46f8651cd4c9cb --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61c6d2a0a71220c4742691f6c2ca0b2531d622fb8e4ac428084677c1cb4d024d +size 3018444 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19f1df7acb73b338a3fc18d34f1e596ec222212e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7e4813434e0be7f01f6365c3a020b1e96ad83de4a3b250bd9c4ec92ef6b2010 +size 3774336 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..acb3ecfe0b3f028e50dcbf7e45885b5ba26d995a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bba80c6c0921ee9d2ea762cd4253935f1d310d3a7592f0c29a5ef45e8b2ecc34 +size 4519252 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d78792d59300afab6c7a0f2725d0e65137e4bc85 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d458df919a05da0a83cadd58e7d277e05aa3372dfef573a73ecb90fb51cf3e5a +size 5294120 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e097e1ccb4727bd80cf7863b066b9c3e9074b7c3 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_piqa_what_is_the_correct_ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfe6851c10e20b136f0b5066932225ff5c5050b5ae88c7cd6cb31f587d19fd32 +size 6055644 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c3300496e44aa1adf4435764bceff0714a8abda2 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:185233874264e165cb97649a36b3524d76f82d918693290c8dd8fb412e6a3371 +size 639956 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb22aa296cca182a226d5a04e8afd9b0c6dc7bfb --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69e1a9c59dcc6584e3162f791cb8707898f91ddc7c192274615a11e54f68f922 +size 755003 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60916239f49a4f68ccb48fb53d91883bf9a29048 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df7ea974ef820b46913c0be7392ff1fbb6df4783f69b95f013eafcfdbf33c2b +size 871262 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1b2b1fb2f3e6b26047ec24fcd519e83a2c218ae8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:feca76e2731868760333ddf894f673ea2b87fca1e4e2587d3f6ba2e68f9ab3ba +size 985710 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c105cefc1702f07f1edfe90dc79610718a2d882a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2440df8940b15df1c4cfbec8e4f3913f5c63fb1ee08534ce217e5a552d530952 +size 1098412 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b100bd1d3c89f1fe0110f2b4db6ee830a8cec173 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74e140f50cde3d96847f402b377ede0d3b1e7c0bd7bcdc00e45f60a1193869c8 +size 1213609 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8889837d3ad9372c34b8270baad3b1ff209cf4d6 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3e7aec61e8b33f09b0ec09925a35ebdd711443b59d6a7835f30ffc75b02e952 +size 1182504 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..120b1bf1dfb5fdadff6637bdbafb6fa14c600957 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b8c7c28b37276d1196c5a5f889bb3a98d559314494f5ea9c879e931b62dd1cc +size 1779309 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e4ddc9aaeaa683442d1188f1444e9476b211ae4 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dae190b423243f128857dccfdd50e0bde4412b83f7644d45395dd312eda2f5f +size 2388534 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c2508bfbd9361cae3274c5c17e48da5db64cf434 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af6016839bc7a24e76df7917ce83265fe56d1a912818d0df0f6cb1616ccc05d +size 2973500 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_4.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..804d569a4472ebb8cb45fd068dc583bebb8ba8f1 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eeca33e43dbc1de4c58397d9ed7ddf6861f5bcef5d395c16e8a08b1b73460ab +size 3555870 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..647e8a0819702afbac21da6de3b09438f1fcaf40 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Direct-Question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b95d6afdc9508f2748b4a64d4f53d051adb489af83a757346aee491a1aacf9f +size 4144659 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da794b2ddfcfba15178c268fc01754b468e6e924 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2636e1c5488826619b08f6329a94c965dc4862ed0c024ab067cef329578d709f +size 1329081 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..27b9f830177dd1eb6908f0a445d58d1cb0c4254e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e847b1ad078dca0a17e18baad54bf63702483ee7956e21c4738413db26cbf45 +size 1524760 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..40d7a07a69518a1a181faa4c2ccec3e687576e36 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f13d6f9a037f0cd068998975d0dfe76f3f774c67d8b55dccb32fd3876226ee0f +size 1720882 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd4a837a610b1b7c1f1aee48abf53523e8912557 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02eeb8bfedfe40dbfa32d67836b6426160bd06a2e946e085857f51a380ae9f4c +size 1915936 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1dac6f8718cf76dc17c477729e954afab7373f30 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6b975a64349576d163d9a2ecd500fc0ae665b195a39d9901c211a7d1d8808121 +size 2107493 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23936db82e8bf477144a79c21e47162eabf7b1c4 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c65df4e4d57ddc7f86931789f610aadc46198a2c1f99b9eaecd95098003c1cb2 +size 2301104 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..90e295d764d564057355953bc194542efc51aa28 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b48bb5e6ddc1a4cabad9af4a70a7aaa97891b6c6df53ecf6a5e7b6e8aaa332e +size 1935155 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..165886d57569f10e31789728972c68ce5b2411e5 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:101ffe39efacfdb5e8c09d5a119ba197b7ca67357309ba297539c76421e87ef6 +size 2636584 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f7c55c904bec27a0c8964c9fee0f2cab1998098 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cea90a6b2f0ac85808b00f02d81e909fc6e7411af7aaa2283fc02de9e3af520c +size 3349532 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29e1f299bf002c56e3bb1a940bbf33d9ee0ec4df --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2761b5bdb5e2f96bd254d477f059d9c2b68b3f7a08036e4e924b2661501b1bd6 +size 4039181 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e1ea247d0de771b2152d11412088272d5c22420e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68e9ae1fcd6552932f632a3d26513045b9cf687ac9ece716e4fe27e87e9ad115 +size 4724413 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_5.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3e877c760b39e710008975bfd7ceda0ac7120577 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice-Question-First_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a95a71bcd15b7725c4da416e4d4139a2d6bd6d0494d9960c84a9c72b8732e912 +size 5415580 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3f0b522bde370cc6d42e8f7dcaa457207bb309be --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6091071ccc1710040fb671e26f83ff566825a3a7d4d9aaf8c851892d10e64e0b +size 1870182 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dc887b5542cb9d376e26987819e861d9cef16e1b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af63e2d22944e64788bd3926cc40ffff662878aa853b706dc4a9b42f6812cb30 +size 2545516 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c01a64a6b7092abef9422105e056577a350f4dd9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:596e6bdd0df5a041de54d6aa1005306b16ca0c343e800be9505a27f0f7b3f353 +size 3232553 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b135ac18dba9c47401f945aa8f371bdae320b70 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74ba626da5f48592825445ea0e559cc9c2265df4e982e194e0cef8c7655647ab +size 3896213 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3e397f6ab9eb835969843217b13eea55fbb9eef --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef021dc00ab2c8ef23e71bfb58a3ebeb0538bb23557b2f683054cc20a5d379c6 +size 4555506 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e81d41450fd52d53599cbc2eead401074308d643 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_sciq_Multiple-Choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a69562abfb7535377a8a72a41e49364e2da3a084ec1d6cd22a41a1fff9f9d704 +size 5220615 diff 
--git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6cf3afd7b6930db9929c172556acad3be0e68f3f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11a922120a1c7f5effe3bb55f3cda5fed927c9e324c2b23a35064eabc5159fbb +size 2213367 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..129b30cc38cfc9f5d18e392ec843cd94fb5c90f3 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a79480cc97572df627104bfacd0338268be9ddd39810a2243d1e4f228dd403e5 +size 2974494 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..39a671bf08b6a92e44ec88f90dc7abb3a29f3fba --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e704e3a8303eaf972563c6e3027e7bb50d96a5b2d149ce9e753b1cc06b20511c +size 3726533 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5f490c650bb40036797a86f598488bfbb903aa6e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f56ffd182a9e0122d60f0e7589a8d3a6fdd77a283dfb654d06f4dd095f78326 +size 4477832 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..56dad7fdfa9cce1055ba56f0e10c683be4a093ca --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3272a0c9250e018bc6c34eed927cab786d3b05ba35489725e7d67bea6e7bd6a7 +size 5229772 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..883ec01e2aa371d85148edbd8c2db7141b3399aa --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Answer-Given-options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3030256ae41b661db4740a81be347346a9cfe3b22f4b55554385ffa71fbab9aa +size 5980040 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl 
b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7b828de3d6ca0d237a0e2c201343e27a8317b2ca --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b22e3d9b0c522abe5b4852d461968600792eeb8b6d805dc29201d3869e4b065 +size 2361356 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..64ec20ab507dab92ba20671dac28914e1937e7ce --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d5c06ea27173eefe6c69d72d5cd5750abb9f6663cd5b44c6b79c522cf0aefe +size 3194044 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a4cd25c22e458ddb9f60d6afb0dd4b34788fa87 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3939414cff6b495e6191f91c0d3311e09f53f3c4a2db44ce73a8771b492b792 +size 4016893 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a375a81080e6419ec89e9d4b5f9d3ffc64424ee --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc7b9a6d159e58c72bf9d2e481838d3b6aa2c5f4858c378fc086ccfa692eb7c3 +size 4839063 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3ddb43991b4fe6481610f83724615fd6fe1c4077 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e4633137262fb0214233caa6c92960cdc6202ea88acd82a0f7c8538342c6d5a +size 5661818 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..10bdb5f13fac0feda3924837a0df8d2b51dd676b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f5753f505797a2b8ff81be361686bca512f383527cf87590c1fbb8f258ca252 +size 6483255 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_0.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..ac1bde44644596ebf6976c7b0b28ecf2c37e8a31 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b70fb1335a4b1893d7be1c21750e284150763614dc2506598f09a193ef76fde +size 1879043 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4214a5aff1a5d7056d05f196f4b4b69d69d5ed3e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3109d11fd476f3ec6f4cdba1dbdfd89fddf43ca2ea2e8833571725e280d1142e +size 2434845 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4747a409103b2e2e8f68a046811fc58196289592 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:200bc241e0d30e028ea7fed7256c9576988d407f8ae2aa9609db05953630aafb +size 2985321 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..052f4a4265ed415538582aa5de8945571842958c --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4e9d5c04281191bc15c3920de8f455a8f1099ad239422794e1366b0632c13b4 +size 3533812 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02893257f8c269a5d10d5238ffbd0094e799cc40 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb1fb8c7dbce560b8c153d230daa9f5e9f4fb1d69bbfc7e7bef7477852b1a22c +size 4082262 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f281a2ce01ae539c89b124abfd5bcc36e097d36b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Generate-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eda7f424454c0df0a94d24efe107d987eceb25d7eb5fb94be59ac106534c323 +size 4629304 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e7cf2d8d4b9333ac8163663b968700f98905830a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_0.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:8926d7d6fb5110561505a2d337e7ca776aebef3958fdb80e872b3697866ffff1 +size 2371100 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1199a68fe104eab3e6712593432f10dd33fd298f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bddb9638ff4b86c790f70dc3e08fe539e2d01958e0ab675ec5ce8d7621bdec22 +size 3210350 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..01891538508aee4ff87a47b3517f3862bbc93770 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14177a5ad866616f8badcd10a22c96df29afb25b1110b22f518c466a68184634 +size 4041207 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ed76a0aede818380091033b12f9ab95b16fdbdfb --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc3c5d770ef33e05c48465aced64a2661dc6dc3edbfcf8af59650fcfd7e3a2af +size 4870964 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..204cd994e276998489545c19b0f88054ded2bc07 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a00850e73c5f5211143cef9b93b304affa533212c2d40a203d652b9226ec2672 +size 5701425 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..961c8b88eae56c9338e96928e9af65855d8db29d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e34c0afa23d25aa2c176f9db8ebd864d3c675100d1584725104cd4b6f7320eb2 +size 6530449 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79374a5b7a20626b8712d23fe2fe937a5337b895 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f86b01214a3ca532e4580d8894109249acd63c6e7f81feaf310e4fa316b5f00 +size 2344530 
diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f8703ea6e9c370c3dee5a607eea304938c0f80a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:828631d6e487cecf37144aace43902f1960e0e34cd54a5d91fe314bfb83052a9 +size 3158556 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..86cd1f85f8ba5db8c97c955c41236dbe940b297d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4c8a91a485b77fd23e878f4bb481269c29a37d543962bcf6ca55fc83e64df61 +size 3962591 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..17cffdf625f7bae404577fdebb1b053dd6a8c271 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eab24174b86e96932d2a3c80296a478b81d4dc48024a68e94fa91ada32de5e2 +size 4766020 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..669437bf4cbfe7b5c257fc2f14847a1f60aecb2e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88292c8c9dbc8c7a476880c9f76fa2c2f51478e54f3ce21f82b666c9880f0745 +size 5570029 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..673aff626fa42694c0908c93bdf66c33bd074602 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3daacac644da775b6af2361976d89a655ac936ea327c62b5aad3885c50fdffd +size 6372976 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f58edf84b162bc95af90fcf84090105a4c04d4e9 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e59fa33c47da6bb5a1c33ef2f7332bfe2ec37537f9396ae69a491b828ef4cc9 +size 250652 diff --git 
a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5028cf9c43046162507108889870ac223c75cf13 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e86ff2f49192d4fcc0250be0d1dbf85dd9041967ac56ebc7878d31981908ecd +size 351375 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fefccaa38803f565960c1dae0c4c45a17389fc96 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bf895f402bc3b89fa31b3ea40110367200e1c531717f88d3170b8ec0703ef97 +size 449670 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..766aeece36f755ca8c23d1039972485f480e0d6b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea58624a1b49068f826a921e7c3b000a44d0be616df02cc015af68ec15278edd +size 553067 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d65063dba0bce855e20de9020221cc3fef814391 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e69ba501da92727ffe60d8619919cfc3fa295ca084eb5a4a5e713cacd8baf74c +size 651711 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0b262fd239cbec802f0c76acbff4307b9df33b2d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_GPT-3-style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2d4b264636ce11a5d7215a8f1973e23263e811bb54f00654b83d4f0b926e6be +size 747629 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ecedd898f837d9681c7fc3ea6f8e481dbab037bb --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3e0674694df4c36107549e8956e964751e00fcd844e15c054f6c6b2b786cffa +size 292928 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77c0f8d480bf954fb578d1043459d40a43a0c9a9 --- /dev/null +++ 
b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0a06f820278d561016b192bc8075a3b7314a10668b22209a99910f30591f346 +size 415394 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..894c8251e7659772d53a170a7a6f7c4f737c6de3 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd73547a3ca87e8e285edaf53173c9309e53f51e58feedd960491c9b49dcb656 +size 535351 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f9e089238ebd95031891ec6f6616724c0679af61 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37db81bbc7c77a785a137d341f4a407178480a2e4dcc96827f9c566ee0a0d0ae +size 660315 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cbf8442b775770ed16683af01bc6e5cc047d663e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d889a866ac786167b4d91c866f7aac2d22c8715b914cdc54648c7f78f116677d +size 780551 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e6cecaf00749ad4da519f5b234e6cedd5c097160 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_MNLI-crowdsource_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd8e6b9d5bb164b99e97107ab00657f57cd8e67e4caadd87b7128992000dd718 +size 898065 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..081e1a00b2fc90fbae2507db6d83f6e1a72230b3 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7368b6ccb0ea70c51ac0cdd2b5bff96605a4d2768c3714fc327cbffe8b30afc +size 258348 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d80d8d080d5486448378e2b6434d7ed525fd4ca4 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe60cdc60c9af2908f329c8f45c8791a99793f5ad384ea4cda56de95a9fe47de +size 363041 diff --git 
a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ca5985995f6911f6f43cc706064abaa8d5091b49 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa03013c0dc45d49aee51ba3ef4dcf9557ce9513bb7f4e12fb8a1da4f21d124c +size 465269 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a2e7899881b661e84984a71a8bf161d6bea81176 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37cdd85bbfd12ccdfaa1bd6a18ad929683f407ab1fd6e21c9afc6d015905db1c +size 572520 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69b021b53cbbe959efc874c49daa15aebe3e6a0 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef1007b09327eef861a41f2123659b85f9f961d84f68a427f751c35e4836e76c +size 675011 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..83c1f54b381f9baf6f83ec1ea243bee0f600c0dc --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_does-it-follow-that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7721dcc5b40421ef251afc34e12010f9b2db882ea7472776fc3b721061c084b7 +size 774785 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dbe40ee30bdccd8417cd9ff8d16ec76e6f557c29 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41fa05c9e1b8f107157f91ddbd75eb6582715795eb00fc8a88fdf0ea364b3c7c +size 261155 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2689d3318836b5dcbbc30cb37e82f9d5586fba7b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61227a1214183ae12588455ab074a34a19e1f8ad7f561d8cd1915462b3ca4d5d +size 367750 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_2.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..d4007c82212d383db4764e1c22591080c2017ddd --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e665dce2fe6ee02219cde7e3b9dd148457393ccf368167c97ef0995c72f056de +size 471927 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f59fcc0ac15b5d0340a273beffc0dd460855c785 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee11fdd960d6f879e97652b1c81888c72207eb0c063354891ba2219321a8817c +size 581114 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..71b6699586d54d36e1bfc51f16ed706a4fb13921 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4564aea4520d3b8f9b4b7513ad5c6ad6d8818e615eed3963e816e992aa158dd +size 685557 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..53705d7f00bccca57e7ddddab52e59ca0439e5c0 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_guaranteed-true_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bebd884e53899f9c7e29627b8be04d3d29ca16d743b8aa82663b8288617f3efb +size 787270 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c3362183c09756e77803f33c718002acdb0ff2b8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf96e1f20763a25dbaf53cea539c3222b838b35f3ce6188ab33d143437f5232d +size 262249 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..351a660da50ad873c9b3e0569cc06ad15494b66f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d98661700638905f465e2a68a139d716fec832e8af793801dc2503f2643128f +size 369689 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..86901c7e8d17e657f481d5f9f37a7eab48a4fa88 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fae73e8a62a0b5cba04e700b5c0ddbfb54ced245ee2d5f721e04d8ce1f55772 
+size 474706 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..be37e2207a717bc745ed70eb7b1b56bc308297b8 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24d0bf92230d9e2cf7d5020293ee52a3c500d4144385d321c6cb14f450c5c96d +size 584748 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..89e708e2623f11a03bd22492f39269ad14829a29 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d520c36c297eca113aefc71e7f27ec49cfd04e54d4bbdf361141440e3676585 +size 690015 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cc1874cc395429bd0947c875c7ca6a1f56c69c89 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_superglue_rte_should-assume_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ba19a411982f208ef3ed563fdff2a28d8664766b839941ded8e6e6061b40547 +size 792570 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d3460f6767199fe5479b35fbc1648d0c54493700 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a58c7c11e9b45feac7368d7d103cc675e37e6ef1cb5829d81ef967dd7f853d8c +size 1039056 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e2f44fc4a67e92d3dce959b052356ab0a0c1aa5f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ae09566c17282ab9eeb00aac46f2b56bbe4871183dd4eea9e30e0bd5aaf734f +size 1300084 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..726ee2140b58b8503d1b46fd925ef8620f4d1d27 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa31a5e7ed6dc04c6c3f3225bbfaca69d5fe8efca79ad7f4c0376a89d9ba51c5 +size 1561217 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0643460821110bd8477c1ba258d2e50edc73d6a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_3.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:2d0f8b121455d10c84a332495e62c4ae7c1e083100764620517d2f9c5c897e91 +size 1822671 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e375d31d223e5aa5230ddbb1e427a523a0a419bb --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76463260b28a9bbdc7c13300c0ece875a5a788a57d194262cb10cc44deac250c +size 2082997 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1520f43aa421026d230f5dec44e4569050724e01 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_Replace_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3df692be1c02e239fcba4ba3a5b29fe088e1f814d3cdfdd79fed329127eca9b5 +size 2343698 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5e245f293005aad140bb18e66c2ed064098d75a7 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b38a93b006d8627048357bf8f028bcccae590a0c17ec1e69108b542f48880ed6 +size 948111 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..080d2771d6efdc87c97e1f21fcd3022a14cefb38 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a167734f94405ca90b373d60080f2b8f0fb43dcfb5f8920ad3d0e8ab387464ec +size 1180661 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a823c54ff838b66d28700ad22918b51b01a7993d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9900cb53b20974066efe0ce5ef385bbfc5d7aa782483a1244cb16665857abb5d +size 1413122 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a73c1f2023442f1fa2455f2d3a131153411febd1 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c20a92f9a0e43568636f470cfeda560476b1e3f2a3078381c097cc05f200b30c +size 1645971 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07525d3c642e951e1f57ede99192c6ac6944cfc4 --- /dev/null +++ 
b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:898201e5e076564312fc934dd32e63954c2a4d715bcb4085f9d7a65025ffcc9d +size 1877872 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..602b463e1815575c15ab26fa982b50eaee98ffee --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_True-or-False_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:478195e6ad14ca904e539a98261c7e89039d3a2a17f9302d1d3cca73dfab39e3 +size 2109999 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..20307f9298887a3215a9680566626dd80c29d31f --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:548ec5698a93fbd338ebafda17eeb32645ad4afc7452d76e665e30d2047775f1 +size 1010024 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..079dd56ced6aa6f65a03ed29b84a9c0f0e860bc4 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3843f7697a70ebe5df22bbc7527b6c931bc874aaa5da1d0fa64167470f38b123 +size 1243224 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9fc2a425783dbd25fb14381eaa94c1e6910df202 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d2bfd05e2730ab222bdcefa8f21baeaedb47b850072ed0569e3c25bede7ed6d +size 1476393 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4e1b3f4c669419be4cc3a07337c7956ee74c0699 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c21c3ab691711778e85c577a1ea95ef771db96a9a88888bc970f90e44077f8ee +size 1710009 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23ceae7d909f379d34916ac40c4325bd05ef5e6b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21e3040d9375fe99e931ecdb4501d52d8230f433f9913cae9c66f4931abfdc98 +size 
1942423 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ae7ea1378d8a8e573492f8408199956e1cab76d6 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_does-underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c956e9039a57141fcbc864d36fd6fbc2ec29de095b677225b337217a0d6dda96 +size 2175322 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..88e4ecfb6b7933f1b83e3e6a7a87ce29608e6a47 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8508f4f0e0cf4b6990cca2fe0350868a4da2453b43849ae0ce5b0f3f2dcc5ce8 +size 969513 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0ce34aaffb7429b84af991e9052a9f3439d1a763 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:222d0304e2cb761cdf350443cc63277167a98784dca26c2bf561e295c158ce71 +size 1205147 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad5e5013b1f33b3203e404304581f0a7979bc6e1 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:884cc806756dd9733e02541e989597dd6335d4321a70275e10552516facdde2d +size 1440916 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e831f513ccd2bd3832397e202ca324dec045bdb7 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:671db2206e172f39530af20b0a33d9244bc4adbba056d866352798fcb5a6eb1b +size 1677034 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0b4578c198c75e8196ba09ee8ace440c9a80ecf --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cd550c1a0ac063a34d24ba27996dac9d15268c560b11db8327db839540238fb +size 1912010 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..84d66423e9b393948e4cea616240c6cf58501c8e --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_stand-for_5.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:b166a65fc55d9f070261d918a63add1b3072a606861993792fc1ba8fe64722b9 +size 2147375 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_0.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d6b15b3be66a719037041efbf431141e5d886a82 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50a69d9a6fe698454f3d35cfb76d0279e5317f4a2505a555974470ba408123ea +size 1016390 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_1.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a5ae16c0acaecd08493a9e5a91024154a3d88aaa --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7c1e46d9239de683332f4a41c811932dc2cc0c1429febb6a4a556792aaa93a +size 1257169 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_2.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5c6d57d13d776b4019db48ca101a4d228d8d6949 --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70bd3204366f236696957464b222c785f8c78a81e197cdc6c37f1e9728311d5d +size 1498020 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_3.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3894d8994286851c5f2bbc57cd0ef301356cc00d --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b18dca6950b2da51d4d4a971864bb1b2ecad7f10a076172d5bcfcfeb2c43e520 +size 1739191 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_4.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..813895f11e37e737b978abc4b83a3f9f51b66a6a --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf02ddeea62eb5cdb0f5b33dbb78546ce70ac4f5efbeb4aac6fec396e9b2404e +size 1979202 diff --git a/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_5.jsonl b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e47d988a9bfdb248b236f2f9a88dc59a4dc8f00b --- /dev/null +++ b/4b284b42boscar/eval/examples.4b284b42boscar_winogrande_underscore-refer-to_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:883227781ef62f307afe1a96ac6a134ca933e45bd216fc33a4558a4b7f6be493 +size 2219642 diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.json 
b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..faf1de1e1ad17ab20141d34787da161b95829699 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.3920095188313853, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05821741416066471 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07689265316885617, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0021889644764877354 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3353633364771332, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005464034559346198 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.10974192875795953, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002108481379704356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03467177168437793, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001263966142663848 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1501022471408039, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0034656326105627303 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04944058280627724, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012733709222344247 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07300762564486572, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0019791385357395687 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3239944481490108, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.005278665955249544 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10487292177099015, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019145881199173131 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07173881674469755, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020327580166630914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3108299199198791, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0050441398235876655 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10223587057095268, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001953957091865035 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a06ddccdd31bd5863cf096563a60b59544b4de1e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5923583934046589, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04413735939354442 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.16010050130964712, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005125042778102285 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3196355072168386, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005119850964520928 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1697750677390444, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0038772039426472724 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.08438981052215226, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0035428710754014017 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16375709757632376, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0035801758170607334 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.08671853497591266, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002631475278488475 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.14464852171051198, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004587920039107864 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.2991072637349091, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004697289445064134 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.15421967175389004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003335409750319338 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.14773643541948675, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004697318924441809 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + 
"rougeLsum_recall": 0.3022253960230997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004739701115545982 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.15698432412836977, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0034258507511626024 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8c15805902f88450646b0427a703e8ca82e6d5e8 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.87481699127398, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.059582002804657114 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.199466981650156, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005680133053279315 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.36987834122063556, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004990035433475184 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.2091307571621392, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00438758838932181 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.1097332483868951, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003880260643680688 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.19989822315140882, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003891995132588205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.1125014602113282, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0030818329485191713 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.17708218948059182, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004959206002368757 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.34473442974785634, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004653107328809787 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 
0.18777604196623215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037686741944170416 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.18275767079289593, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005158519349266604 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3495549840141228, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00470047440035223 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.19268792375031765, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00391282714541826 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7b15ce77a0c72ca87603cdece4155e8b5beb25cc --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.917052522385174, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04515993268519611 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.20888904165496103, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005880585457367179 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.37814776640994463, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004965150831364198 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.21372711407619938, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004343895044489938 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.1168574620986129, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004100382644887659 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.2033520821796969, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00383298408380243 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.11478946258154298, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003021059673039041 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 
0.18605811290952207, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005155344402670075 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.35240104768920266, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004560857947768766 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.19232862909601875, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0036812800257055574 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.19147784744930493, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0053469173736333826 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3571044253500232, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004630285578564224 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1967270339578505, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038181408662667426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2d7b6a9891726377525988a802cb6799d384cb8e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 1.0770812969732624, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.057150178907157206 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.22981660265238718, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0060091983198828705 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.39885875556534467, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004954969825753986 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.23556795007723105, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004533277698287348 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.12750847088352804, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004054232891970157 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 
0.21938886368566654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003991587985278903 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.12790770467752932, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0031721283738045283 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.2007562947847461, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005097419240769892 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3691187056685199, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004545072578815252 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.20953786614403025, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038198330880285674 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.2090073734779189, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0053710149363667935 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3759411168888221, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004622209838017953 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.21611300792836619, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003996019817069793 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d01c7bed5f1b84ce1c8868a18272d98cf0f28785 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 1.1504048600305352, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0755822004199928 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.24380689117778276, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006371373530201665 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3952201821451148, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005033224197794555 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 
0.24203388956598934, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004720213442449205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.1418504594039488, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004575582136553598 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.21936500213450158, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004020020751334439 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.13474976693002946, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003415392488984557 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.21441718014302874, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005537317704553182 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.36454653195485753, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004645366026599087 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.21539137146469606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004028386529865914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.22298936285767557, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005807349708841443 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.37135465454110034, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004699420865300012 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.222229551042884, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004199234047990457 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b97464fa4fc260feb876e80265d46f9139591b5c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.037033597706448665, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0006480174033579744 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.2658754336065589, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028536856339901125 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.06288652661017977, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009866252295731882 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.0029692249314187723, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00030354572703394786 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.023302986599903073, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001796060317593677 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.005046491430199868, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004836957271082255 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.03673799721474021, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0006063411247670403 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.2649595962209743, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002781644829670126 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.06244410003667717, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0009275108134304108 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.026375944242512106, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0005028801609392026 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.19861431821513748, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024233727810549985 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.04500463562063431, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007706525242560116 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 0.014682233268574497, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.003688525019018053 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ 
No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3e3dd2a092b8393f314c1b362a71e9e1eefdec14 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.4990908768667419, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00672293123926927 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.3805259635624399, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004958240532995044 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.3790094382887789, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0046211045713230115 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.26611927734105884, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005475714121309219 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.19307213953268704, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004131999445973537 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.194190380103921, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003903903754432061 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.4143553380956953, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005965933002105354 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.31867009667119994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004444126461539901 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.31348326041784963, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004037814530879785 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.4395322981032592, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.006214073376970294 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.3327332188496468, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004497692202972866 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.331045098999967, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0041576690263424 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 7.474586456502016, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.39828351246520366 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ad1c64d6aed3e88f4b5abff196058a51b707e759 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6659469751218693, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005282697210611973 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.47545834078235505, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004786244888221911 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5116416549448322, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004114092104651227 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.3975962388061356, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005108496306917878 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.27663543585966394, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0042716661772286965 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.2978341790256733, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004004300858372415 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5552137990129092, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005109973224126155 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.39635198024010654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004504106157915422 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.4246199460652537, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003945834399770465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"explicit-graph-description2", + "rougeLsum_precision": 0.5888915350791896, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005163507906173889 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.41693152931630784, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004490602016806981 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.44860309585930136, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038862990100455236 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 12.866414027645662, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.24833464289152501 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b0d350977d548a0c0cbd9195cf757f4eb05e8102 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6699599078220461, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005092472906400196 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.48581171511308424, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004848016885709874 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5204893657590925, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003995053794749861 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.4035890102326372, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005057285855009103 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.2874311239495429, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004360261608760115 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.30735803941946105, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004029624004352192 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + 
"rougeL_precision": 0.5579808989293318, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004963235286574611 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4043034901530828, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004566274351211597 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.4316278841611481, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038744793307385252 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5899222181000761, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005023121673406623 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4243581128257757, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004524193813515205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.45443142624873073, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003778925289442561 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 13.405106075117686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.21265154164644462 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_4.json new file mode 100644 index 0000000000000000000000000000000000000000..91da9a48c83300cdeefea8c553424f51d22bcfa3 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.679557178036161, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0049896787371130945 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.4926458422706465, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004854101278525955 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5315805694115959, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004012328176756712 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.4129644172378307, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0050022305172101 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.29527142677608653, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004432346853014904 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.31723495338229024, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004058983755190999 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5657042645684208, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004851123687930964 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4105814772991126, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004560892673388257 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.44156064111814675, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003921560686510802 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.5995826786382206, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00491266364396876 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.43150554107231065, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004531011417146284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4657105547522727, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038479056635345133 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 13.745741838550028, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.13695001364890935 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b1278b608dbd919904ca4ac9de803f49e56e748f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.6826576556901013, + "dataset_path": "GEM/web_nlg", + 
"dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005035472419306049 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.496039516340874, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004919796581238693 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.5338363845067036, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004077687650001848 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.4176777178908642, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005001554560141764 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.29901241727774297, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004423169847085888 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.3201869787310687, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0040154529017539875 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.5695040159880653, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004915689277082928 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4142614814635289, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004604637322440169 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.44383512761730387, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0039016002306109226 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.6022615757723847, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0049610108640678885 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.43461793110243807, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045641302637812915 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4674544381420857, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038204072100002187 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 14.058124934994158, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.18995835187996135 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b418d93d667aa5eb3fd78ef050895ae10b8b0caf --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 0.3684892812116536, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.036712135855659016 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.04055453069407477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0011144496310485231 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.2383898367427312, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004435035822455946 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.0639148965678454, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0013744475750369117 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.013765451487632733, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004999336960221954 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.0819997006913652, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.002759657093416114 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.02212990258629694, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007608947170272215 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.037636089436926534, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010064070012691636 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.2262791588390035, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004150274619964078 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.05950218629777505, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012027985690621355 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.0343763430204763, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0010307718598180854 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.19898079264280746, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_recall_stderr": 0.0039718249449850924 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.05377498340164964, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001226466015770773 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bcca8977789e031b497c54a6ab75e6b3bf8e411f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 8.146495790721922, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.3594818741787167 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.5473703579890434, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006234195978512742 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.42594641383154397, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004933350696797209 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.4232777622497728, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004489922497303896 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.3031075729839342, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005143255574055858 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.2292919782826598, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00401177364228117 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.2275191754276285, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003794704955104901 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.4607152705533072, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00570887962645297 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.35839428699463327, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004451667967473151 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.3535992424841204, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004013818428817113 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.48504564210358153, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00589786396794517 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.37384075852237625, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0044950626203494985 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.3709034906044357, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004086212522565927 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d814f0fe2c0a8e75d0660ae7648a0f4fa2ae4b44 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 11.965909947656085, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.31385721546757483 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6536501149948257, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005409528765327417 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.45486150118372615, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004815350756377679 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.4915502890453908, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004135175958914253 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.38878972701326525, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005187583200225757 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.26314783989707097, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004159223323165467 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"implicit-graph-description", + "rouge2_fmeasure": 0.28440321602825763, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003906016823261451 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5524708911463064, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005232101406770008 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.38340825054880595, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004460889502364635 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.4125596874295932, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038819772760533638 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5799349457482273, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005328155965064084 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.400491019067507, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0044816598272816015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.43207188694214654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003876435956885669 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f6796e6bd49dea26b16db457bd0ebe5811be8851 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 13.13186528743504, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.25017887243204956 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6617563503433895, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005307576635275494 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.4672733883523212, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004779826795862267 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 
0.5046364417249789, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0041137000959081205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.3990665978536603, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005170131087123088 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.2747669553241779, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004209867424129332 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.29697113176548073, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0040088345337766715 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5583282235127254, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005147780429733306 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.3931717940655256, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004438132016764882 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.4233054491917572, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003914713751938626 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5859572766342701, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0052220876338499615 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4098618562170075, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004420858442059482 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.4427213766987302, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038669409347444014 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d87f8f70c01a5f213074fd9410e9a528fe4478b3 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_implicit-graph-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 13.328668063663947, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.20856051056572658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.6710716761451476, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00511429409385906 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.47175315947396945, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004727398989004289 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.5147108147070596, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0040421844786449005 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.40579962182414614, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005013526043842869 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.27880930640418905, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0042069671751406475 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.30417235473262294, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003942481114293333 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.5660704860358193, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0049534592472307865 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.39827745149126054, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004420352389146525 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.43294752280435206, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003872021745790226 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.5962684608451612, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005023585380290913 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4164845623261301, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004403741091135774 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.4541200525443879, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038206569305483883 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..481aaa0a4409aecd0d061a541fd05a56848b235a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.04944814629677289, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001497247190770107 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.3078611471396491, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005303327109994828 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.07979619965642178, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00202734988031741 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.018309292582639056, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009363142372349108 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.10973537747729963, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003623531828256602 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.028978222976628863, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012880735235947889 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.04549819729320685, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001251826643585312 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.29237462013260235, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0049507093051764355 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.07400909397863684, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001739473777782939 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.04274858858845132, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013570541361121234 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.26685706989523095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00464097795566496 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.06879897817639219, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rougeLsum_fmeasure_stderr": 0.0018242627181544728 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 0.6689759196732665, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05286049862282843 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9f670cda20459ee433f3b0b0e624b55345afa36e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.4827311075483846, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006077411990486251 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.440560161659474, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0052302780699066 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.4035238082909345, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004570572975206814 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.24944030688244834, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004802331976841843 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.22747538576384696, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004244017550736157 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.20506994083869234, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00374059622154692 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.3996179052833305, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005390496217652051 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.36687989332089216, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004656909354140392 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.3324034993111807, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003956103883993652 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.42409893318395286, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0056060227641647874 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.38527217880573095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004721852532261511 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.3519063860696538, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004091141865598614 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 6.588467749744722, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.4032245951577365 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b30ddcc86237690202110f6fb59e6d565fca005e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6218232316667821, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005485945035845688 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.48929591497649727, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004945740426108298 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.49917842830527476, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004251608550641214 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.363761721700491, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005070900253455173 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2813926579070104, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004320741447701495 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.28603637886233174, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0039899044556314595 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 
0.5146727177802017, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0051593661561140395 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4058527142766151, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00455624921981018 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.41145761111444557, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003932155746306229 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5445021126408139, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0052472546111600946 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.4277096020477725, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004603985369731013 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.43449798192526906, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003939338221863128 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 12.691912397052652, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.40247041652067606 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d18338ab98a6316b27b55937d52412d211153e80 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.6539516131156088, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005120945883448623 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.49134395840583855, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0049736746384781375 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.5196984502923798, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004176674618016702 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.3881069659684622, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rouge2_precision_stderr": 0.0050054138504511915 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.28865850153467393, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004454679661879614 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.3039382130147192, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004145367604949371 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.542464954202006, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004908401431806659 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4081820258818348, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004608939681390705 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.4294401372716624, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003929315367564241 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5761616210022392, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004963802338610327 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.4299823565839692, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004579751956132399 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.4542192589085261, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00386272709242378 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 13.687688261339932, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.19387514723356358 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..096b65f047e4bd4b1c9b3936e1352c450927ec1e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_non-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.666787476202095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004816802985476908 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.4824167251076838, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00488554294646065 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.5205513700223448, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0039244527177236 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.3987364626703962, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004862803672938963 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.2846654569322822, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004369301517990411 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.30482876318174756, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003890400311031704 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.5557839457874688, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004740742530325836 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.4019048819024202, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004526221932495017 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.4319783380790841, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037571034362471243 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.5876109824655803, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004760377125248539 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.42154188186799263, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00450828933959814 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.4547951881516365, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0036898013140689865 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 12.977348547129905, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.22464862432323532 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_0.json new file mode 100644 index 0000000000000000000000000000000000000000..184821ebce1f0bad5eaea1d9de86db33295bc786 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.04063152255085763, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009054683111284807 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.19521133552459877, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0038813865060671627 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.06052568382193392, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0011363889236589205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.005054974672803902, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003777424396489787 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.0358093728893402, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0023699383967658925 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.008292494676019994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005734768199869111 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.03526164999991809, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007791455445977729 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.1711738377729294, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0032942261964877483 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.05237101199172248, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0009524720060349769 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.03706225741025466, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008484227497407326 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.1794475729484543, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003570964070190367 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.05504659493150974, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0010385645573037228 + }, + { + "task_name": "GEM/web_nlg_en", + 
"prompt_name": "very-explicit-description", + "bleu": 0.17302442690857225, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02088479151120539 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0a76dd4529e68913ac6cdec8c614997d6a933d96 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.5266983058761379, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005473280184276909 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.4710372208511584, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004978818208675463 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.4486051628643562, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004239934043509985 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.27552483520049653, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004546192682283346 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.24719355667721984, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004206623227614193 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.23227922621904507, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003721636880083816 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.4297432923758641, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004966273860866354 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.3857495217657205, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044865340856255174 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.3642156831080444, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037789972737372617 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 
0.4578436021001714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005085772605555204 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4085222771166399, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004575537252880692 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.38745765194476134, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038618804528035827 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 7.4227316699041985, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.3050632292572588 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_2.json new file mode 100644 index 0000000000000000000000000000000000000000..63e7fae7bf1863b1f694ff91ae72ddf8c0795b6b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.5860706534652017, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006142796102630763 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.5028936278021936, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004964031388238394 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.47904191685874215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004610233219452129 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.3366055690160356, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005205078374672028 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.2817439584805951, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00430035974522067 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.26883463217035797, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004056807034581447 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.4835386188892664, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", 
+ "subset": null, + "rougeL_precision_stderr": 0.005530759641627937 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.41990625782772617, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004627778482080496 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.39389153715288033, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004103096564742158 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5126095254492385, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005737254659292593 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.43736947708111573, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004561221700465661 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.41563558935111344, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004187263162417417 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 8.626424905603868, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.30084585108448175 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9088921edf2ee88c1cd240f01ca802ba312d0b37 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.6363403403216966, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005423277136161756 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.4955972233334783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0050068211886118656 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.5086336073047357, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004319061668617563 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.3645817354081736, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0049385804033239104 + 
}, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.2825475726166293, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004406165781571923 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.28809184008147987, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004080653299670392 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.5214189737717493, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005060972606517546 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4081340048943837, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004601320296051857 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.4159281452387241, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004008282134454578 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5555281548058759, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005198395944020702 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4305573948340927, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004617202953527601 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.4414249404982676, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004034492167499606 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 12.663443364165664, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.6003687307907446 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f907ff1a627e0c55ef0658ef0cd3c692403d1253 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-web_nlg_en_very-explicit-description_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.657705603414944, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005103281874403951 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"very-explicit-description", + "rouge1_recall": 0.4846039210835352, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005102941916192306 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.5124136340794241, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004227034125527551 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.38405400325709016, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004889050644854438 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.28231903680054155, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004474461877919754 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.29573775508453526, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0040428348803391015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.5436519042949046, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004878771074784812 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.4010305116437114, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004674690962415214 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.4219068270959287, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003940905009709809 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.5752358432393784, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004969306936657398 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.4212144900895478, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004698142381367077 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.4447700952492226, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003945243545356445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 13.283879544265606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.18046364359305653 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8530f13c35278e6ce066a29bf42895f7c52d653d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.13503566455470678, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0024040162474698724 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.23441729288318755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0036599750426349466 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.15754810601459762, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002445300070875694 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03140531450622776, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008248347072909491 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.05772646805767443, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015906532638753756 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.03727817770827085, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009271630142541763 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.09384208659305457, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001689696736451482 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.16871420996645434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002758914891553921 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.11016974085290407, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016745228271743341 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.12513519617347155, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022485478817274706 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.2177607422267832, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0034240718038598007 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.145932266658728, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022684805534109835 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "article_summary_en", + "bleu": 2.4999917470932496, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09136692821965282 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5330c5614f9aa092649ed30b1bd15e26c767b01 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.1505381571626013, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0024564237561880524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.2351558337379536, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00345563635731422 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.16455606393487338, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002294994166116859 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.03428549150065371, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010296385076122587 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.055229584003465094, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015302468074412797 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.03719814046102062, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009354291436813923 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.10761813485443772, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017831187131585585 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.17102216865216144, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025824199249945834 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.11732986444478297, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015793448331767731 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 
0.13982406382793042, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022970904948854606 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.21865517328256018, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0032228483029092827 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.15264929345779932, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021225644143411724 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.2010538746394595, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08101371643710994 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d5f658de318863f0a528c190ed9b02d08963ebb9 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.1876293598203017, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0029674404218121616 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.25230478447120946, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033884919761666852 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.18747231585436525, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002366408430604848 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.04975916902771021, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001487770429115759 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.06360606183941207, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015975217889021207 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.04722227777064152, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010961277893244215 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.13784498031331896, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0023167225723095476 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.18552837609394215, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025681044505090965 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.13609982397073794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001695861644814157 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.17452434804986092, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0027761425038835114 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.23522852871681593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003178841751082744 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.17439134256209984, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00220237936065734 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.840300351651135, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07130210999169914 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c11c56ebf33d3fd194a7bbe6109985f6b022f4a1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.1839935150260243, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003346950875411787 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.21800826428765932, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0035395802723768343 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.16819036448780483, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00249732983792147 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.052348258405649814, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge2_precision_stderr": 0.001782889854536389 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.058048612213817206, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016463744725270463 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.04446177615823931, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011470710110295435 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.13974270455081728, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0026867179254436427 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.1639987000897844, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002733365684474388 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.1254498650320625, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018563840983896952 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.17192952870829883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0031753297530428574 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.20270410550643933, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0033069811684398828 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.1564584387566433, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002329587318436108 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.778235976798028, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.1174187394147134 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e3286f4ff7b8c08a895ae241cb7fe5577a973f07 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.07175606709271613, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002883021045304814 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.07370835441795094, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028219227214772644 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.058775945272166535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002128501293673969 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.021225726932429398, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001353702211218657 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.021001218125129863, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012128317967941386 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.016525925525545243, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008902078561462972 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.05572046585928227, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0023257534664432025 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.056105537285426776, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002185488741676815 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.044519899911493195, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001620071457669129 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.06641449373056402, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0027052374207703375 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.0680432273603457, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026209040416315887 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.054134016173616836, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019690557767872534 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 0.19229073122355, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.021891114094233034 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at 
end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..45d83717d7a69de7d65411104c0acccd7d35b195 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.014860040829321053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016169810385583425 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.011501893332992012, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0012000235529442773 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.010197681105541754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010205284597569913 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.004286038547963845, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006750544266144003 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.0034999394129427755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005036071046640206 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.0031538730485515167, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00043997651618954913 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.012372385966053627, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014249396336128456 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.009162693605863482, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009679537038074917 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.00811132739080454, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008257902852490833 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.014160515487803211, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015675837920299837 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.010763669777769231, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011239509571883462 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.009545008500985965, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009573811061672583 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 2.5334089241458653e-12, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 7.473669615841683e-11 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b9de8b0435495c9cc9f84035dd9067ab9a378bb8 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.08127870970387267, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00159496450472922 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.11614055304212574, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00211819393542503 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.08751176126000156, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015289149605506355 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.011010781180132406, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00046413206931745474 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.017353929713234406, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0007980921461916622 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.012313525594602524, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00051069361487657 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.0670588180579985, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012403365589683047 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.09825523452607922, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0017514381154861217 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.07274632268029439, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011790060287316787 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.07601033342044172, + "dataset_path": "GEM/wiki_lingua", + 
"dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001489877196693457 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.10896403734860502, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00199374637678252 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.08188486629306464, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014230045630912664 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.7090027829444067, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.059403705322662224 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bd5aa4340971fdc4cdcf88972f283bc9cfa4079c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.11718371549480533, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019207525066087428 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.1234378142215776, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0021119106085494836 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.10439627151445018, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001492503209077825 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.011169246181700449, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006980956389803306 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.012611453444703798, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000789477613635532 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.009754190532310308, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005251092959860216 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.09349645401843876, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015339441744450736 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "rephrase_en", + "rougeL_recall": 0.09820200472415783, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016478612439534423 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.08249721608530192, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011129719074541209 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.11232285776375517, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018301585055001148 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.11825003875705886, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0019913607518053916 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.10002969343936566, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014109391973704906 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.7784287218692226, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04647782988951344 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..92f7851f17240daab572da98d8151e1487a2133c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.1758303182294891, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003462451959062334 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.1883786913476069, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032322227595843827 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.15023928960073749, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023557832440037887 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.046013593879532945, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018013389641468377 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.04570020004067074, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, 
+ "rouge2_recall_stderr": 0.0014354529428897169 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.03619017321536887, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010416765949087755 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.13643697056796278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002803241654465616 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.14527010548054942, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002530982991259681 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.11461382411831511, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017522041790913572 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.1648841707672492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003290762730353922 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.17601314551035446, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0030249867013223885 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.14032663089975259, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021996161120837656 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.5720898763538185, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06900520135924704 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3aa4d65ff74751a8de0f9a033b5d02b9bfbebfb9 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.20336271876650794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004029433819785622 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.17729454008110915, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032685763318790126 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 
0.15150667381006758, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024684929994384665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.06253967117819685, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0024298335038909547 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.04711446698399212, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015238057694360616 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.04099567404353483, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012134639956742775 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.16300354995627306, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003457637855102496 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.1376210474869735, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002552749011506665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.11762153288067587, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019023444904376398 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.19133776900682994, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003854597063658495 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.16579113722679306, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0030775732756182375 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.14165398317895528, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002319209416009038 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 2.40664978256626, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10165209107523865 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..db22d6fa101f2f1d51db7015c594eef3c2322f91 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ 
+ { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.07570691992215821, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0031769105627284205 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.059481738771106525, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0024694047666392245 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.052169934920531964, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002011618006335426 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.02312729904071614, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0016638788683403377 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.01723565302833218, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001102234180757535 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.014696313352161674, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008734069534130452 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.061644364211395554, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002707542539889897 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.046759300119261486, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019578754806582866 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.04097311985693082, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015825649237898086 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.07138944469911805, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0030407812924645727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.055567400646258695, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023184097053874945 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.04866449318467239, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018776959939917014 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 0.08798252672366903, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.01615147537721119 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8d1902dad0316b0430bed2bd02d7b8f5f9f3dbbb --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.013286397211624563, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001504677557742547 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.008705964916807795, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0009664416727583056 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.008355758385893338, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008892610122505367 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.004518509944468516, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007701323271558087 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.0022539281604353444, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003261469102752993 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.0023400550715016594, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003377640465114735 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.011288412771902762, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013224615413419574 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.007132645715713445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0007894051530351685 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.006807029265944781, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007179828918381072 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.01271335891292784, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00145484420282144 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.008225269451860548, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0009152493951939503 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.007873172278594525, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": 
null, + "rougeLsum_fmeasure_stderr": 0.00083319726831352 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 7.837525856560887e-16, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.2469168934061554e-13 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8bd5c4a284e75ca72ea2a91bbcf646a90b7aefdb --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.08356461827231487, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0024170033290248937 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.09215261028156187, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002030742204450942 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.07457954055741463, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015706133068032748 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.008089449665266878, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00044542591051177844 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.01060270864330856, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006135532813043637 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.008251799871683644, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004400505526509918 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.07378864225361224, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0022398054335912194 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.08084815666892478, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0017354953004792963 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.06490017831903919, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012990934205015005 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.0773053527266884, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00233032304469025 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.08512472218977873, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0018927606261400106 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.06843881980738721, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00144201624548823 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.4965418693062675, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06336636372085828 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6361202560067c56d87bf2e9351bf21d4111aa3d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.1281252372198017, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020477888133567127 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.12497569074861344, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0019520455760729037 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.10906609141044966, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014081519487517005 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.01185449066902512, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000840457069932496 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.010852632040411836, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006918502465566141 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.009018221770209143, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004951723995251364 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"summarize_above_en", + "rougeL_precision": 0.10275230010792302, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016619122004975422 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.09981624192731062, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0015160011216860429 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.08664583891836886, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001050482144420516 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.12279548165800759, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00193310449452201 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.12016569662446316, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001852297919694553 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.10469247701628683, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013253125566948223 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.7256071294825857, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07948879643932899 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4575228ec97bfd0ae8bdd1abe34b0afc896d3b14 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.2130994513227411, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0036353936247548735 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.19599404536132142, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029965609835451374 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.1676452338322572, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002227551090748977 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 
0.055485760051538865, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00215799809893319 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.04479590241268949, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014249893850712678 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.038230611510681065, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001102150265059144 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.16773450143295235, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0030808144747857297 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.14980977816188165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002281955305092778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.12791823608415107, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016384015342240182 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.20253341704063588, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0035146252518724183 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.18480217629421566, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0028137217195253915 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.15830475139273484, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020952349680806023 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.6332112820833435, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07276165331548821 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..61d70d157bc9ac7c410e626edb2d45b84f206121 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.21405966081177125, + "dataset_path": "GEM/wiki_lingua", 
+ "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004161193500566949 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.17673627728619523, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032206945936224464 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.15504698516533022, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024750219681964492 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.06600877424854136, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002513661939389598 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.046653184328022644, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001511363884859671 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.04072525071554273, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001154064126239323 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.16992128001744655, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0035157095180261316 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.135841050753072, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0024938095859423927 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.11896885200232418, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018574676959350028 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.20278412255856923, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0040134091234927125 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.16584081593068659, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003022250624756856 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.14567952073605564, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023259948668787433 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 2.405742021443971, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08765857428125907 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9d9b6353c8fc24222d981a40239a32865919382e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.0764117688968914, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003310015328209796 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.05500790234723034, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0023533448786149888 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.04985418381603688, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001960922158649172 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.024322633331148896, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00172947059164052 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.015488676770406946, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010366733748569117 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.013743928699783595, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008105948544237309 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.06259275682135215, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0028290362323728545 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.04314953057223309, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0018562044556596725 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.03920084186369486, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015347913250690194 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.07243692089182784, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0031760715288622755 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.05161651824502423, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002210435572023429 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.04674099530422977, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rougeLsum_fmeasure_stderr": 0.0018346514063613708 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 0.04603112857491259, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.008088785186144275 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1d31e5abde386893e1f217966573289035975c89 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.01158797019081859, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014113661523309372 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.007589223568536127, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0009189129383816963 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.007387558901390138, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000843094067827481 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.00396343343286023, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007692116628974225 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.0020173657517965145, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000354114887572235 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.002066764939323049, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003674443571385706 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.009881225809305108, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00127413856729575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.0061423589274229315, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0007464014877194372 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.006005050518755766, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 
0.0006965777440156409 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.01100925224171755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001366746162025473 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.0071530057213977545, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0008733261333433707 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.0069366139221113485, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007971177406173816 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 5.838488988667251e-17, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 6.1928335157223854e-15 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e9b1ee90f0cd9d46125c774ad1fb84fe2e9a1669 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.13065355742673593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003004776280579319 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.1786059794590456, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003480497255310532 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.13547708241929612, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0026556069493799896 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.03291780593276675, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010760829551416996 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.04663674589032859, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001504992918818976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0352234336154376, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010798725770486046 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.09987150235228559, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0023685337113851174 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.13959473564876165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0027581190918920103 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.10336654675385076, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001968971023431868 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.12288788441024788, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0028720387887482485 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.16759622234914887, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0032867178590762366 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.12705867398208964, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002506787857940285 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.0868626526463756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10334620651927755 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..08cf79c054c944ab6ff084e9c30269a6ebe3f298 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.19126686092152348, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003082884423906421 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2013507500089625, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029925213818961095 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.16580650031794167, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021193004105035614 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.044582320866862424, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018321118047088484 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", 
+ "rouge2_recall": 0.04442769282883079, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014782140733801531 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.035746404962808155, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011192977008782524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.14988887413463797, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002586002929489674 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.15563614848002308, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023345594461348924 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1274347165087169, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015926173758816556 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.18119105294163743, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002938341173461725 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.19011415446724061, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002811944286183604 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.15656227686437413, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019872168695675768 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.5514775583592173, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11284261690010776 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b7199fdbdecbaf9be52b97e5978f832ec9799730 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.27518353034811777, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003681886785126964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.26994010795497553, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003076759192772023 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.227527675745211, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002262727638760964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.08493527530542791, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002342907743897752 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07562522541046568, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016670921194054875 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06444156598905733, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001316135695695516 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.21378447499783487, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003086057725507419 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.20706303996123673, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002420883132061566 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.17356357206141904, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017138094026774846 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2594677476381253, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0035263951440822477 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.25382240907884707, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0029022538349274 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.21385148741041204, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021335293397074695 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 4.160843322149366, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.1298621943311205 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c2abe43ec49c73bb3d6693c4b9e96bad1d155530 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + 
"results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.25197379414278026, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004061114481390069 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2241545120711431, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003349453782690701 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.19542809253461213, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002567437085860653 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.07748731951936201, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0024143985835070676 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06396850914384923, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001702594494628315 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.055649339745118376, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013436864732712206 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.19629979985917206, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00335622576475892 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.17220070773118773, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026273647249788665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1493802268370947, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001958022255827534 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2374958020095092, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0038790177970112016 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2104711045184697, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0031578834852280147 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18360399651239948, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002422426009983074 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.550269268318738, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.123976667805652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..641477d20d4b910b484ff42ab0f9f2935f7bd89b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.08736393590844754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003320236333651585 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.07170734676784853, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026863536115591326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06369530530813045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002225322220412946 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.026539844024013316, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0016227872678637359 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.021485803029836836, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012234458979275497 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.018728966913288573, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009803445596888972 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07026197905523278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002775489585285312 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.056214726744091746, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021292176604441565 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0499467876025126, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017576059173842731 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.08238350525240551, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00315393172421982 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06716878885751017, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002524004286911468 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05976166581566339, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020932837351865146 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"bleu": 0.18727975729565796, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.024743677554065382 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e969c4b63635f130c10e22a8121d7ee914b777aa --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.016308055800076632, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016664962689093485 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.011189655817252247, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0011802896184760587 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.01043565444271075, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010239464330389257 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.004996639391364696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007412751630573523 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0035850495924414864, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005653334006649237 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.003212219718708495, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00044504346236305713 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.013336917042039756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013953475029781011 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.009180912567410115, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001001081279334972 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.008463888867128002, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008498965362098245 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.015695607203614483, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016199770379973 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.010692369328543307, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011385128958340267 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.009948416660924693, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.000978895492286139 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 5.577157241826622e-13, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.1569346422902409e-11 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..27b2c0bca126982eb1af3b3494d2845445884eea --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.11966632209944479, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018289026636228675 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.17699917791063266, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026232545831137728 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.1301753308413737, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001845181946043979 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.01769714030024297, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006122997523884595 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.028670426399557025, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0010678780392830523 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.01998190512680113, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006699945809559366 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09438727023667497, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001303032090526388 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.14103328454918124, 
+ "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001963566607993366 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.10242209288458175, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012780368890878686 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.1118510836616567, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017112676227580969 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.16569303942590743, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002457833604587797 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.121571478712092, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001719907239916338 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.03372792986763, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04498324238148764 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b3ab282b51ef4cba3cf8bd5ad15b8c284ef0f705 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.11863961702545356, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016088988795161811 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.12008771066629581, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0017439629461919743 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.10502771804225536, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0012846977475557287 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.007275016003204525, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004169535248542494 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.008322730126316197, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": 
null, + "rouge2_recall_stderr": 0.0005516341243080924 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.006731410059333951, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00037866401637734617 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09461567478817941, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012475184392361028 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.09556473399275549, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0013339685617020215 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.08310986365787817, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0009355454004341337 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.11442130354097658, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015357165164883602 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.11581351195921423, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0016588658424958223 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.10128744060086141, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0012205159612731504 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.5402539021847423, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0624145046719431 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cbde83bff9061d431ddeac4d0d40e2c3f0f1ba40 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.14826479664329056, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022603198325511016 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.19594580754248062, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002751527088887584 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.14860219897828134, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001917302371146199 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.02493138347514756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009874915174681875 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.033473322025973654, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012512631437445213 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.024563417541210154, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008202915878607498 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.11089102163547263, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016898972686976235 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.14784069855662121, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002072707847991526 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.11041398908642593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013335422583366145 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.13993502338514574, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0021373400435202843 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.18435209978526113, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002565912635880525 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.1398773618649369, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017851823888757378 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.623453279222236, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06273494818292862 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..2842894d05d00e3a628f9c3b64a9a10b8e2ecad6 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.12219509555835543, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0026166419024163078 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.15850063378476156, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0030215463841090718 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.11745387491243063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021004572141077632 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.023677234953105034, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011613452323130811 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.030381539120417637, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012727961384364653 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.02127276979142645, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007674253666113632 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.09430746394042341, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0020959207990629792 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.12231473215281553, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002337973909999958 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.08912109764317326, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015176601962052629 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.11499260596518415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0024911178674758127 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.14866443294458095, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0028317609417797232 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.11001570950646093, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019597563244517636 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.4377303685533975, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11251563995967095 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a863737e0088aa66a421363a103d4100003df916 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.03310867480312166, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019161965531372164 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.038587507598865006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0020099564019102865 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.028565135483987054, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014237764447699868 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.008639108153884063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009900284053428036 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.008463237849863133, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0007375666493237083 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0061590533339769245, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004920915450177505 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.02658430393615535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016197062516045348 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.030269053997186237, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0015723260820266977 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.02217628719189859, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00108264414651739 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.031188351551050795, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018354648916057872 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"write_abstract_en", + "rougeLsum_recall": 0.03588875149470869, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0018627048019628082 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.02663365521272406, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013268975899176129 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 0.031709892809790814, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.00589379637253143 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4bd089d646ab1d2d8d64236267798da99cccbf09 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.003641954878791069, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.000801030571157579 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.003014962652659718, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0005505619854307441 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.0023856746059350152, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0004343793032224447 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.0010633847104680439, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00037205433457009767 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.0005105251875900575, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00013313252308758052 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0004938815252314359, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00013206141666044304 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.0031336768330716977, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007374760132829963 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 
0.002362031253209354, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00041493245818900585 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.001865391120809492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00032336237332238825 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.003559126075504872, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0007918434685765267 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.002883544387289442, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.000520757291267085 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.0022948013067646622, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00041592013921203823 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 1.7895417015749206e-29, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 3.71058847601332e-26 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a2cd4ef6840a04b53111ec9f5d1104c9de4c1c2d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01487687202745673 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014770821817934647 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2dc595edb28e4c662063346e038ef1b29f57dfb2 --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014944140233795025 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01492201952373296 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5b8e89930efb8a35487e2ae98c3e8f210cb34132 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014965960710224475 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014933117490932575 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..99301291e2066c95a650d549725be3d9e38a463f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.351, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015100563798316405 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.358, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015167928865407557 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff 
--git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..eeaf42ca6731745cfa9eb9a1897af37c966cf599 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014899597242811485 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014818724459095524 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0cd64fc1eafe1a2d84d258a4f4604c0c5312d724 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014944140233795025 + }, + { + "task_name": "anli_r1", + "prompt_name": "GPT-3 style", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01488827258820394 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2962ed3df3ed42ea394bf2d6514e91e5b4327990 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01488827258820393 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.327, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014842213153411245 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ba35eb896f7573565410ae5a2422b5e1708d3ffc --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..abc9255cb376d4b2fe2d08c5e98c5eec515d5b4c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.351, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015100563798316405 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.353, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015120172605483696 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7ea8ccb6676bcf5a463bf27d8e6f03414698d289 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.336, + 
"dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014944140233795021 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014998131348402706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..11235aabf9ff6824d4709cfb930ea2d9af7f6d67 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.0149550879186536 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.349, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015080663991563098 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4dd8c5ef62d14e4a2e01b1b2bc96b40873e9b115 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc": 0.357, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015158521721486773 + }, + { + "task_name": "anli_r1", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.35, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.015090650341444231 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_0.json 
b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e7ef1bca2a802bb26c754aecb51e3ff1601e43b6 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229857 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.334, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014922019523732968 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c4b39abcacaffc6c43e71c83323e27a569f8e368 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3cb71af92b294c2124ba71ec37c9bde6e42f3ca9 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014944140233795027 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014794927843348635 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3904fd16e410012f1f726c99ba7927b8b02c663b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015008706182121728 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014865395385928373 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3c4097b6147b6d8241a9216f46313ce8851e255c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014865395385928367 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014770821817934647 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa431680296d4b6e1170dd01cda93fb2a9fe8a0 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc": 0.307, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + 
"acc_stderr": 0.01459328489285263 + }, + { + "task_name": "anli_r1", + "prompt_name": "can we infer", + "acc_norm": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014806864733738859 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bb1696b5313f9ad15aa64a470b43a9373523bb4e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229857 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014853842487270334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d90f6268751693539ef36b259f37be7b180b392d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014865395385928366 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014876872027456734 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..23613e588f3c3058eb30bad6da03f60003aa5ce0 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014933117490932573 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014933117490932573 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bc770d95968336d41369b6194d665da0bd221b5e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01483050720454104 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014770821817934645 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bf9d6f985a1bfc820e9964ecf4fbf16db4280502 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014671272822977885 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.306, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + 
"acc_norm_stderr": 0.01458000605543697 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9827b80167f62b971e97b5fe92e909b7f86c276b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014865395385928362 + }, + { + "task_name": "anli_r1", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014818724459095524 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0ab18a637dc3f07a243eca1247de8dbce5bd09e0 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.342, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015008706182121731 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014899597242811475 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ad27d548d6d8afc2239671d26dd0c765644a70f6 --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014910846164229863 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014910846164229863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..333577bfbe7ba21cf15ee5f33e6682aca01f608f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.341, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014998131348402707 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.329, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014865395385928364 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5a9895428f6e52c6e989e9c317c97bff3384b1bd --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.348, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.015070604603768408 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014933117490932575 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c7d1e6eb4884a78efe2474c12f1411073f8278cc --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.338, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.014965960710224475 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.01489959724281149 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5f2944506e4468f9c33b48a97947c777c2224e87 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r1_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_stderr": 0.01468399195108797 + }, + { + "task_name": "anli_r1", + "prompt_name": "justified in saying", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 1, + "acc_norm_stderr": 0.014818724459095526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a0b96a5ebabf12cfe52ed03c6b290810bdcd0a64 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014818724459095527 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.354, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.015129868238451773 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..418b063ddad7979ba00a2f1e110de615cd4e6703 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014910846164229873 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01488827258820393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c319ff5181f138260c9a330e7b5124980c55f095 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014734079309311901 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014709193056057142 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a5c6357d38e10ebe5581c57678df2f779ce90bbb --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + 
"acc_stderr": 0.014721675438880215 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.321, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014770821817934645 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8034f56572a02ce27bdca63186f885d4ef3c4b98 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792508 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014709193056057127 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c90b8c46ee4428fdc7be25e39d79d857f866024a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01470919305605713 + }, + { + "task_name": "anli_r2", + "prompt_name": "GPT-3 style", + "acc_norm": 0.313, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014671272822977888 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..d5e80843f1089a62b27c0f783e57714a7e02c7c1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.336, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014944140233795021 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01489959724281149 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a7eea77c40874835bd352519306ea88781ea4970 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..13f8794de631b10f163b9764dd83cd983ba2f6dc --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.311, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014645596385722692 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.305, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014566646394664385 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e518a00683bd050802c7da2909c42566ab0ca173 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.292, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014385511563477341 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014498627873361428 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..248eb4377f22bfbfe5abc2c035a4636ba71d353d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.303, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014539683710535269 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014498627873361427 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ac9ca0e60536587c3c4992fd8fae7116a607df02 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc": 0.296, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014442734941575018 + }, + { + "task_name": "anli_r2", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.3, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014498627873361427 + } + 
], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5a3eae582dbfc9f1282aca566860af640ac907c9 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014955087918653616 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d02d5ee9704753569ef36cc2863ee1e3c244efa5 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fea95a3389aa6ceaca618ad396d272aa8e3dd864 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.316, + 
"dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014709193056057142 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014853842487270333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..429b69d6a8dd12f310efdd663c36b007dab6735b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01488827258820393 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014853842487270336 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3c0cc852f618eb0ee00adbcd2427c7a25a87d966 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014888272588203928 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01485384248727033 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..1edcb588f3c22b54ec20a2a643f99ed233ecba47 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014734079309311901 + }, + { + "task_name": "anli_r2", + "prompt_name": "can we infer", + "acc_norm": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014830507204541042 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..efce3a2ad58efb4f476b548b3579697935e96bb1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.332, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014899597242811483 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.0149550879186536 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1dda690bbeea662a8d9c4464bd968a4349900dcd --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.314, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014683991951087974 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.312, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014658474370509012 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d88e821da16772e44f4d75af3c94dc03b1e4bdf2 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014806864733738864 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014818724459095524 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e5afa2c68c35b5c2478446a32eb09ea67d65bc8e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014794927843348635 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.317, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01472167543888022 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2db65176c6587c4d13af2e5e084429c1893d597f --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.324, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014806864733738859 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.318, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014734079309311901 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..eb8f28a04e451dbb5c5cc7cc1ef150c8220e81f2 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014933117490932577 + }, + { + "task_name": "anli_r2", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.339, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014976758771620345 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..161e15dc32c1e6604496d5cc3c002d7263b06681 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.015039986742055233 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01491084616422987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + 
"task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5846f760d6c2d135f2d4b775c9968f1f685754b8 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014696631960792506 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014696631960792506 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..98b555e95de8f531815cf0671e8f46d91e950b3c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014794927843348628 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.331, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014888272588203922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7a119197deee724bea03b9c4f1d0796a82e1f3e7 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.337, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014955087918653595 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.325, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014818724459095524 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..db0860aae0681cf559bc5057eee021ed95342ef6 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.323, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.01479492784334863 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.328, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.01485384248727033 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5dd4c134bf16c843746d2b256153bcae03d38839 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r2_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc": 0.316, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_stderr": 0.014709193056057121 + }, + { + "task_name": "anli_r2", + "prompt_name": "justified in saying", + "acc_norm": 0.326, + "dataset_path": "anli", + "dataset_name": null, + "subset": 2, + "acc_norm_stderr": 0.014830507204541042 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5d8c66e02c04ca370d26aaf3217eb8096a5b239f --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013562032919529019 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.34, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013680495725767785 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3bc748d653bfe3ead2f380f9a4bba35f7e1e2b5c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.345, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013728421539454876 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013696658778002519 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..46602bdad8e92005ad25d0181ba3ec42e1e7a3ce --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.31666666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01343407866082739 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01350837286730022 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + 
"seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..25858f2f8169a9aa5e6e7b9d10d7fcf570087091 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01357080625843363 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..066a695ba3780539b0334913e50ca1901ac54f9f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013499258621103249 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013613950010225601 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..642d3797c722a26f72f04f979ae88afeff0313ef --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013630871843821479 + }, + { + "task_name": "anli_r3", + "prompt_name": "GPT-3 style", + "acc_norm": 0.3441666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013720551062295756 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e23d3c7b0e445faa249af37ed9dddce204001d7b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406389 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013526454480351028 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4a531850e6ac99a1c3a9a6b11d547d5bccc88ed5 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8c84bba26aadd335f65eb71ca8576d671c8127c6 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI 
crowdsource", + "acc": 0.3258333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013535422043417459 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881638 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..93724da993f3cd16d63e093d76f9ae40ec97c137 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3258333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013535422043417464 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013471620929769144 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1d96e81fa78fd172786f4b5a0de8ead0702d0968 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.3233333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013508372867300219 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01357080625843363 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8c467ebe75af50a2fed99addf1ef3ed5969c8e79 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc": 0.31333333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013395739415639082 + }, + { + "task_name": "anli_r3", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01347162092976915 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..af544828e96ded129bf0c47286353ff8ca474ccf --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013696658778002519 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.3308333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013588208070709002 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f401beaad0dc5c03785b93eeb2697abaf47206c3 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f4c241ace54e3d42a99c2709ae57cb9d6974973a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013526454480351025 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.30833333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01333672114313647 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..67cf8910890108b2aff46570a92f1cc2ed791853 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.3433333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01371263383046586 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.33166666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013596836729485166 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6c25d0725c4faf0826307639bc9952c0b5daffe9 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.335, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 3, + "acc_stderr": 0.01363087184382147 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.33166666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.01359683672948517 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3b6cff9592756db3199a9417e393fc3382c313e4 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_can-we-infer_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc": 0.32916666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013570806258433618 + }, + { + "task_name": "anli_r3", + "prompt_name": "can we infer", + "acc_norm": 0.335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013630871843821476 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..497b0bc5ead898a9c773db5668800e1517835fc6 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013544340907003667 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.31083333333333335, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013366457845965445 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5158fe0e478db81947e1742e4fc215018191fdc3 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01357953127780092 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.33416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013622434813136778 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..610596b7a9b99cf539e4e6c960e84ba529fc93a6 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.3225, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013499258621103245 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.31916666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013462309712005134 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c76a3e226924ea61f406797b39ab9a665ef0aed7 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.33, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013579531277800925 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32083333333333336, + "dataset_path": "anli", + 
"dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013480882752851557 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..58d6fca480ca50d09738f02140975925462ed5d1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.315, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013415009084004866 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.3125, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013386029277441229 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0b2b427a8be40ab71c3a40d5158200c241e8f322 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_guaranteed-possible-impossible_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013471620929769145 + }, + { + "task_name": "anli_r3", + "prompt_name": "guaranteed/possible/impossible", + "acc_norm": 0.32666666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013544340907003663 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..08c4054d78f0b1a5b614faac4a85c30601033294 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3416666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013696658778002519 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013605417345710528 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9142a5e74c3e7610ee034408f6dce98fbf8171fd --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013647602942406393 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33666666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013647602942406393 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aae16a64d6598ba12a79f66787f256c423b32101 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.31666666666666665, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013434078660827384 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.30666666666666664, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.0133166423190707 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e4b80978c5148575b8bf6a83ad209d53ad654dff --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3375, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013655897185463657 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.32416666666666666, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013517438120881629 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0bbb3c412542edefc9bd9b5fecb7ca8da8482322 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3333333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.013613950010225603 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.3283333333333333, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013562032919529019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0752cc78f112def648cbf7f376dd52eb1c1798 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_anli_r3_justified-in-saying_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": 
"anli_r3", + "prompt_name": "justified in saying", + "acc": 0.3325, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_stderr": 0.01360541734571053 + }, + { + "task_name": "anli_r3", + "prompt_name": "justified in saying", + "acc_norm": 0.33166666666666667, + "dataset_path": "anli", + "dataset_name": null, + "subset": 3, + "acc_norm_stderr": 0.013596836729485163 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..32415243640e54408d76654b370450a2c60fdae8 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.22696245733788395, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012240491536132873 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.22696245733788395, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012240491536132873 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aaeb0f4870a69afe6d17fcbfc5c9484080d835f8 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.22098976109215018, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012124929206818258 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.22098976109215018, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012124929206818258 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c4402f7cc78865542a3d8774f859b92e83f1819a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.23378839590443687, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012368225378507135 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.23378839590443687, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012368225378507135 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2c4d74ea78fcaf762ae3d5404c623311fed2bc09 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.0122889267608908 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.0122889267608908 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c89c9bbc58021481b4ac67c3600d14577d9c9009 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2508532423208191, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01266819862131543 + }, + { + "task_name": "arc_easy", + "prompt_name": 
"heres_a_problem", + "acc_norm": 0.2508532423208191, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01266819862131543 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..190ea1cbdc632cde7d38f0e323c2462ac60db528 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24573378839590443, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.0125810334537301 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24573378839590443, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.0125810334537301 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3a98b71af4cfdd57925fe0f88ecf084609028d07 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26023890784982934, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012821930225112556 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2901023890784983, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013261573677520773 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_1.json 
b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..42b4ba5cf53344293cf6ddb8438db8ed768f92aa --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2568259385665529, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.0127669237941168 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2960750853242321, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013340916085246263 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e50736776cbd5c0fcb2915a01a745d48982be9fd --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2696245733788396, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012968040686869154 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.29180887372013653, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013284525292403503 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..38d3745daf62edd560b61e60c892e7b45d2f4aa1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.27047781569965873, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012980954547659556 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.28754266211604096, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013226719056266129 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d921ce881140dce53a027eef0e42ba733d976161 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.26621160409556316, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012915774781523223 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2909556313993174, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013273077865907576 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..060ee1f877587c2f3063a08ccc4ede3bf56d71a3 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.2508532423208191, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012668198621315433 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2841296928327645, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013179442447653887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fdb1659a05c35ae3575232a5f05f584c4e54adfc --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.24744027303754265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01261035266329267 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26535836177474403, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012902554762313967 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f75ce6ede14294e4ce4be6a665465ef261828d9c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.23037542662116042, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01230492841874761 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.25, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012653835621466646 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..73fe5af6f60487cdd1d13ec2d2e67b81a8cf2157 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.22781569965870307, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012256708602326917 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.24061433447098976, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012491468532390578 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..648822727417cddaaf516e810b15535e42578e3f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.23122866894197952, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012320858834772273 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.23122866894197952, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01232085883477228 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f60568ce8a119d283286e390e17acb1e49cbb6a5 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.22781569965870307, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012256708602326914 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.22440273037542663, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012191404938603836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..263137a1b8601bd304fb25b92078de887ba15e7e --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2235494880546075, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012174896631202609 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.23037542662116042, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01230492841874761 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..14de5ed8e63990c72b2cbbfe54ff372dfcb6b24b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23037542662116042, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012304928418747611 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23037542662116042, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012304928418747611 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4dba00191534a1428c2cfe4d496a47d99a96cad7 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012272853582540794 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.22866894197952217, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012272853582540794 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..36293476286cb6d4cc3417bae57337239658048d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23037542662116042, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012304928418747611 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23037542662116042, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012304928418747611 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7f3cd799c015a59751441b00479f11894a4b7d53 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.22781569965870307, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012256708602326903 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.22781569965870307, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012256708602326903 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_4.json new file mode 
100644 index 0000000000000000000000000000000000000000..4d3e5f05f126a190ea428ca9d0b40820d32f31d5 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23720136518771331, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012430399829260861 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23720136518771331, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.012430399829260861 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..42209da41f97eba1b653e693199a00d6a0fc9a34 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24232081911262798, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01252159329580012 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24232081911262798, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.01252159329580012 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..470e37b4e419d88b7ef368eb7cb20cc4d95cd2aa --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2636518771331058, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012875929151297065 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.28242320819112626, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013155456884097218 + } + ], + "config": { + "model": "hf-causal", 
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6613cc82353bcec2086f622e68934f5816bc3273 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.25597269624573377, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01275301324124451 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.29266211604095566, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013295916103619413 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..614903cfda61d909b6e131ccbba3ae3983b8a167 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.26621160409556316, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.012915774781523231 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2858361774744027, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013203196088537367 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e13fb6638587639f406393658d5dc88999d20ff1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_3.json @@ -0,0 +1,34 @@ 
+{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.26023890784982934, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01282193022511254 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.28924914675767915, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013250012579393443 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ad1a7beffbf31ef5ad3f19dbf7043346ed4ab1a1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2627986348122867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01286252317535133 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.295221843003413, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013329750293382316 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7ccbf6de87abbc77269b5835b96c396bf75d2112 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_challenge_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.2619453924914676, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_stderr": 0.01284905482685812 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2841296928327645, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "subset": null, + "acc_norm_stderr": 0.013179442447653887 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": 
"cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9e7878daa19a84d83bf6857cd4bb7f446f5932b7 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25252525252525254, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008914948991495706 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25252525252525254, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008914948991495706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5896d2ea67fb0d1a94f73efb1a9e49734d6ee735 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008844984581934903 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008844984581934903 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cb28c350468178df0e3fb6a727957f7e347057c8 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.2588383838383838, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008987501845758058 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.2588383838383838, + 
"dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008987501845758058 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_3.json new file mode 100644 index 0000000000000000000000000000000000000000..817fd22cc4a81d00c90e26325255651ca2b71674 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008844984581934903 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24663299663299662, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008844984581934903 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1e24f2e46694dcfef8bb3ebd3e426e80ecfd7d05 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24368686868686867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00880917174472056 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24368686868686867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00880917174472056 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..f3c4b86e06a2a037e4ff31490b03e4269814b54b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.25336700336700335, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008924765424529264 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.25336700336700335, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008924765424529264 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..86be4fd9709ea0d1c2ce2ea5a2c1d44c117bd7ff --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.375, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009933992677987828 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3164983164983165, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009543851857323891 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..386210ffc029032c3d04096c1fb3a1734b8d8bcc --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.33585858585858586, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009691180932083506 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.30008417508417506, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009404000558513351 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8148e610d36eb28268e61bcc3bd6287cc6f58250 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3261784511784512, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009619849417035182 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2887205387205387, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009298805565435511 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..404b8161b61d3f8f7e48f7439ac2f415ad33747c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.32786195286195285, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00963258707617002 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2828282828282828, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009241472775328228 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..00d5d2166cf0b86e55d575265e9bf26322ba5417 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ 
+ { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3164983164983165, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009543851857323888 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2904040404040404, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00931483330293628 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e794a4ccd7bbf5b107b034a04d5fde17c9f199ed --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.3148148148148148, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009530150430975598 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.2937710437710438, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009346423298166722 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6a31b94805528613e8929c90775d225408bdad3e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2840909090909091, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009253921261885763 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26557239057239057, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009062210626971845 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + 
"use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f17fff02fc19423856f423cf786acccaab1b3e47 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2638888888888889, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009043789220055136 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2756734006734007, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009169229476542565 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2892fcda1938e3ce1f902003fdd8f0d95f8fba3a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2668350168350168, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00907591585926727 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.2807239057239057, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00922052617471136 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6e83f0dc6d085cda1ddfbb7d968bcd28131b4130 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.27230639730639733, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009134218447652678 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.27104377104377103, + "dataset_path": "ai2_arc", 
+ "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009120919741760597 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0d9f8852cfae52142c0ae07db9f5403d98af825a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.26262626262626265, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009029861776763752 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.265993265993266, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009066789565615694 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b85f9c831ca4f4077d0472566ab337079517ede2 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.2537878787878788, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008929657065808293 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.26346801346801346, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009039157374497715 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..d1a034010d887652d035bfe4f24bbcf1b616dbff --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2537878787878788, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008929657065808295 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2537878787878788, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008929657065808295 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b45d24a47b4cba035175503a21835cb9f4cb669e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24368686868686867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008809171744720559 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24368686868686867, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008809171744720559 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fc6dd4cbb8217646a64d15151f1381f89edf14d8 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.25126262626262624, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008900141191221648 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.25126262626262624, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008900141191221648 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..401eddb5af36cfc68a036c29a4056dffe2a17d6a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23947811447811448, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008757032594354034 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23947811447811448, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008757032594354034 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b3df8da55416d6d6f165e48c80e4f3ee9fb0f6d9 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.24284511784511784, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008798836444222033 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.24284511784511784, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008798836444222033 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..e91bfa49a8dc2595306d917e8d845233c2ae938a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.2474747474747475, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00885511441483471 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.2474747474747475, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00885511441483471 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..35a624caa26545803c3a72a7a4d007e2d225f042 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3472222222222222, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009769101679700909 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.30934343434343436, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009484615220606831 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f4420dd8b3dd3803d5508a3a7da6fb1ecf2d0b58 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.32702020202020204, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009626235849372194 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2916666666666667, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009326752065621162 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0ca6d239bba700666e4d75d72e89d9d4ce54429c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.3287037037037037, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00963890316702216 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.28324915824915825, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009245632200075455 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bd14f64b7d08617b433cbf1209fd2c4531898d0e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.31986531986531985, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.00957082182057359 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2845117845117845, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009258050925618821 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..648fe5329e5a24a2eeb667e95597d11dbd00c838 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + 
"acc": 0.30892255892255893, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009481048387761353 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2904040404040404, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009314833302936282 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3346c4378714c9b8eaf0481f5e926847b9731307 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_arc_easy_qa_options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc": 0.30723905723905726, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009466688832475374 + }, + { + "task_name": "arc_easy", + "prompt_name": "qa_options", + "acc_norm": 0.2904040404040404, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.00931483330293628 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5f3b7ccc3f36809c655112787f1f41743a9fea43 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5883333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008986619341172336 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6263333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008833986042519329 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ce5ea66fd6b853108d91da147a1a59116daa7e79 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5896666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008982215188519145 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6033333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008933122315228992 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..570e9c5a96f4e671b813b9195647efbbc14f9440 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.587, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008990955404907169 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6156666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008882569490543049 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7cc0fd86c65465410317d87d138e5d0cf82d8f40 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5986666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008950698369218394 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.611, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008902401412932078 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a4914721ff31f1f83da8c97188e474e9fdd19725 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5913333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008976614094836194 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6086666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008911995272576809 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f44c214dfc3907855d611a5cd40a604012d68a6e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_GPT-3-Style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5966666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008957972256087361 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.602, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008938230472973836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ee755ce7b7736eb1525f624a3411efed6c05996b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.605, + "dataset_path": 
"super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008926639623340282 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.37633333333333335, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008846558976258924 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4ca660280e9b3d51d874009b802994787305d1c1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099982269204863 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099982269204863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_2.json new file mode 100644 index 0000000000000000000000000000000000000000..27dc11f276f2fd36b492b2d8588f45a2caec363b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5353333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009107405418833935 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5333333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00910991912725527 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cbe05fef440321f4203a696dec3bb9fc241bf770 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.5136666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009126819837938642 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.5043333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009129888226428837 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_4.json new file mode 100644 index 0000000000000000000000000000000000000000..efb91c65368476947acbf7788cb15291fc19839b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.49266666666666664, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00912924906387328 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.48633333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009126819837938642 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7af939ece170f048c6ce8472f9ea008858d01d3c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_after_reading_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.4836666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009125358337932443 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.47333333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00911723665908298 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c1c328867a205316dd162f859a6209818843cdb6 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008846558976258922 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.617, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.00887674483503322 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1ecc505b15967111111b4c4139a4b87965cc4ed0 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5426666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009096928229880423 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.542, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009097962646004976 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9addb173cff1f5ca3864ea0df49f5153ed424818 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5266666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": 
null, + "acc_stderr": 0.009117236659082983 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.521, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009122174705469926 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bdc8e6e64e48ac903fa79be480560e652963f51a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.515, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.00912612159491215 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5076666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009129157751283578 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8eb6ca0118b9f60fa87b8bfead07b3539165e6d0 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.506, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009129573723461864 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.495, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009129774600800658 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..03187781321029c8e562bff77ad169fc1b684f7a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_exercise_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.497, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009130066778130833 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.4846666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009125936876338593 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..585fc0d9556f6f79cbbf465b322a390801a03332 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.44733333333333336, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009079439381402937 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.37566666666666665, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008843442555522142 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3974a6be2b052b444956c9d3e727408e6e2cec09 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5413333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009098980657278165 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099982269204863 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bfc2f118f20bf0cda6ce4dbbc310a6eb63b4298d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5446666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009093726495969151 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.536, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009106534814375938 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c07e15bf93c03144f5a9561ca59e138b0dc6badd --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5283333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009115560243539177 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5206666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009122428543456457 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d30960b600a59b1fad8840200faea52c3cb3e38a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.512, + 
"dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009127601238448371 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.496, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009129938951699211 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b48549f9d2d674f22eea9a61bdb640e6f7bfd76c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_valid_binary_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5033333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009130028227490719 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.4826666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009124743220028738 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..53856cc2f7a9b1df50e62ee3e77d0706ac1b67ac --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.6203333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008861873799148993 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6236666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008846558976258922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..12435597a0442254897be0762965c31907f7f2de --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.541, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009099483512819305 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5406666666666666, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009099982269204863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..72b9be931ea3fb814de9b1bc7383628acd9348a0 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.592, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008974343780026194 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.5943333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008966262991425923 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..01b921251022afaf6b5885f0147aef8fcdbee242 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.6033333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008933122315228994 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6043333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008929245712536294 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e6606dd9c749be4caae6fb4b4fe23dbf3f905e2a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.603, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008934405848700118 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.616, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008881119942353995 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..685448d316a8f345d65833ca05ede88d915ecac5 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_boolq_yes_no_question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.6103333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008905164372580982 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.6143333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008888323636208593 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..99fe3b8a43b9762d7eeb12ce86f3a55944feb499 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.3392857142857143, 
+ "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06384226561930825 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.18571428571428572, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a3a28bd013f167a3f43057b8a3b6311181c5fdd6 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.2976100628930818, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..83c06d87bd6b9a81017a029c6ca6724dbd535350 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.375, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06527912098338669 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.28595317725752506, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b64b85116f7aaaab93b9fe317773505686757058 --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.30057471264367813, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dfd8554b3eed41999bfb19b32efaa79e8bf54d80 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.3228070175438597, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..72f2d912fcc9848f20ca7a1cf92c66f0f35b1c54 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_GPT-3-style_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "acc": 0.4107142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06633634150359541 + }, + { + "task_name": "cb", + "prompt_name": "GPT-3 style", + "f1": 0.29455848810687524, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_0.json 
b/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..508ac1dabe62b3670eddc38a00b2b33cd49eb18f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813057 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2850877192982456, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c107301cba7ace358589d9d2a2cb27ef9da08e97 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d808f3dacb4cc508655721153d310e82d3c63b6a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0672477765493766 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.32222222222222224, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": 
"", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fcba4a3e32d349ec8c7e53c5aff52f556d734805 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.42857142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06672848092813058 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.23986486486486489, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2a163c90b0b4bc4bc8dad2c7bec649815332a375 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.2660493827160494, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..02e432d44abd7263a71ecc871e794fce039c8d80 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_MNLI-crowdsource_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "acc": 0.5178571428571429, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06737697508644647 + }, + { + "task_name": "cb", + "prompt_name": "MNLI crowdsource", + "f1": 0.25267737617135205, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_0.json new file mode 100644 index 0000000000000000000000000000000000000000..75b4ec2cc53dabd3f54f539e5bba078acd7fa664 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.30278191753601597, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_1.json new file mode 100644 index 0000000000000000000000000000000000000000..291bf3f066cfab47a0e2a9c48c32dd3bfe3fed6a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a0ceae919f18d14177de639efa17f8ec55fcf823 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06724777654937658 + }, + { + "task_name": "cb", + 
"prompt_name": "can we infer", + "f1": 0.3421052631578947, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8a6d8f2c06b9679a9fbc6856d0e3558bec5f1a3d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.48214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0673769750864465 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3519445514054678, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f39427c0cb9894f97cf5d775072bba3d68708efc --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can we infer", + "acc": 0.5, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06741998624632421 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3485871467866967, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3aa16100ceca88cee74b334a92288383412cebc0 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_can-we-infer_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "can 
we infer", + "acc": 0.48214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0673769750864465 + }, + { + "task_name": "cb", + "prompt_name": "can we infer", + "f1": 0.3333333333333333, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c6cc7a05fb7fea066f8b270eb4c8e186f855a922 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.14285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0471841613625583 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.11942959001782531, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c1490aa026aefe063a86779111a49c6cff9fb0ee --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32142857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0629736228905634 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.25043478260869567, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a404f97d218a2b8afcdca864337964748bf97ab2 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.30357142857142855, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06199938655510754 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.2763645998940117, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4fb4b15a2d4b4d9805bb52b9676d28fad520efb4 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.2857142857142857, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06091449038731725 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.23859649122807017, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a9a7fca14b93d8918a6d3d7a31fa57a0cbfde887 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32142857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0629736228905634 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.2299145299145299, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_5.json new file mode 100644 index 0000000000000000000000000000000000000000..308765ae172d57d37117f9538f3207d28fd7ad8d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_guaranteed-possible-impossible_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "acc": 0.32142857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0629736228905634 + }, + { + "task_name": "cb", + "prompt_name": "guaranteed/possible/impossible", + "f1": 0.24952959907367203, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a1a9280bf117f0261942a2ab9972230aa2292f54 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_0.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0672477765493766 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.28359788359788357, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eca6c37272094e681c3390bee57152fac0b71a5e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_1.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 
0.39285714285714285, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0658538889806635 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.2842025699168556, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_2.json new file mode 100644 index 0000000000000000000000000000000000000000..465929c3660475582b22dcd037f74831144705e6 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_2.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.06703189227942398 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.325725338491296, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e6ccf709422abecbb416fcccd41031de2bd15fb5 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_3.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.44642857142857145, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.067031892279424 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3237591332829428, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_4.json new file 
mode 100644 index 0000000000000000000000000000000000000000..be1a93583a9c16e655592f9bc5f367d704c14a54 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_4.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.48214285714285715, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0673769750864465 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3404040404040403, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_5.json new file mode 100644 index 0000000000000000000000000000000000000000..69c7a0e375f440d71983f3a52f70a7398b7e5923 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_cb_justified-in-saying_5.json @@ -0,0 +1,33 @@ +{ + "results": [ + { + "task_name": "cb", + "prompt_name": "justified in saying", + "acc": 0.4642857142857143, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null, + "acc_stderr": 0.0672477765493766 + }, + { + "task_name": "cb", + "prompt_name": "justified in saying", + "f1": 0.3253272334477062, + "dataset_path": "super_glue", + "dataset_name": "cb", + "subset": null + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_0.json new file mode 100644 index 0000000000000000000000000000000000000000..eac196e087b40a2fc48eb46f27f3c3ec4ddcaabf --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.65, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.047937248544110196 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3c71f3c14b80503da5798b369fd2bba8e4057bd8 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.58, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_2.json new file mode 100644 index 0000000000000000000000000000000000000000..65e8e490d3491343c28b491e4a592c49c74ace69 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.57, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8e2cb265c999b02b08f9a9d8c9fc7136055a72e6 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956913 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4ee73ce2e093954892c65265763af2ecd38446ba --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.54, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.05009082659620332 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..884c5e0d523287481ebd5facbe792db72a26f527 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_best_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "best_option", + "acc": 0.56, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "best_option", + "acc_norm": 0.55, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049999999999999996 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_0.json new file mode 100644 index 0000000000000000000000000000000000000000..451b1a4598387621201d20e2d81cb16b8599dab0 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.62, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + 
"acc_stderr": 0.048783173121456316 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_1.json new file mode 100644 index 0000000000000000000000000000000000000000..50ca2ecd00b5c8f9c902096d887f51c07be19de7 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049236596391733084 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_2.json new file mode 100644 index 0000000000000000000000000000000000000000..663beaf8444927e8634d7dc705118ecd40e57c2c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..e803394acd2bf82f15a49fa223d2d69e36243d89 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.37, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04852365870939098 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.37, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04852365870939098 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f4920067e49dd9131b288ca5146820f70bdf1a7c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_5.json new file mode 100644 index 0000000000000000000000000000000000000000..51429ecc2f921e5f6f49a0eef5cec3ebc94a6160 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_cause_effect_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049756985195624284 + }, + { + "task_name": "copa", + "prompt_name": "cause_effect", + "acc_norm": 0.43, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049756985195624284 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7f64e52de370bf1247ebdd79b5c6d6c7e7ffa0ad --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.59, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04943110704237102 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0c9d43ef34aac365c860ad6a604687b838b3784a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049999999999999996 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.049604496374885836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3f4016db851cdbc9adce003083aa101b4437499f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04902071300001974 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.37, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04852365870939098 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8b028f25ceb81af6eff37b54e822a3ade1b47e7e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.37, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.048523658709390974 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.36, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04824181513244218 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f3b4a83bdef7a0a5bed3e7b0abce3f8992cefbc2 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049431107042371025 + }, + { + "task_name": "copa", + "prompt_name": "choose", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..03532559393a88f849469fe510c83aeb17f38156 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_choose_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "choose", + "acc": 0.38, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.048783173121456316 + }, + { + "task_name": "copa", + "prompt_name": 
"choose", + "acc_norm": 0.37, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.048523658709390974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_0.json new file mode 100644 index 0000000000000000000000000000000000000000..387a50c79532c0f8c30cf94240c569bc023023f1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.55, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04999999999999999 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.5, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.050251890762960605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cf4d0772ca6a95bfa89f6be57ab9c5eb7dcbd4d1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.44, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04988876515698589 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.45, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bef52e5d6d569a7d341ed269435bc3ba4fcea9f0 --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f77be672c9ed3885ce51477271793af672d99859 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049431107042371025 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_4.json new file mode 100644 index 0000000000000000000000000000000000000000..19aa36a663e0621d512c7b70cee871544b805b2f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.41, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049431107042371025 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..93a8129662121a79c846dc1eef1dc802461bf38a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc": 0.38, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.048783173121456316 + }, + { + "task_name": "copa", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.38, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.048783173121456316 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_0.json new file mode 100644 index 0000000000000000000000000000000000000000..36c62d49caebd97eba755abe89f79c582bec03a5 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.6, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04923659639173309 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.51, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.05024183937956912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_1.json new file mode 100644 index 0000000000000000000000000000000000000000..59b768efd9fbcceb342abca0f1e0a65c7c80daaf --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.47, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.050161355804659205 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04960449637488584 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_2.json new file mode 100644 index 0000000000000000000000000000000000000000..588efb908f9566d47a8e643aa353ab958fc7f791 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.04902071300001974 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.37, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04852365870939098 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a16fd69f9ec94c6deebab215af84cdded9532533 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.38, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.048783173121456316 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.35, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.0479372485441102 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_4.json new file mode 100644 index 0000000000000000000000000000000000000000..87e37b6fab7c2b1076bd203be2e9a1fc4236f6cd --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_4.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.42, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049604496374885836 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.38, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.048783173121456316 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8b804ef305cab4ae6a3d41d006ff759b6e86ca05 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_copa_plausible_alternatives_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc": 0.4, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_stderr": 0.049236596391733084 + }, + { + "task_name": "copa", + "prompt_name": "plausible_alternatives", + "acc_norm": 0.39, + "dataset_path": "super_glue", + "dataset_name": "copa", + "subset": null, + "acc_norm_stderr": 0.04902071300001974 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5ac694bf09e8b8717f6df171971c63f5d766455e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 3.6336092621190903, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.04963488024792009 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.43315596188440153, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0020658767170531234 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4957166938261749, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.001866401578793264 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4535276083867647, + "dataset_path": "e2e_nlg_cleaned", 
+ "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016078121139671377 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.1849292195431834, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011286695386874615 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.2168971083308747, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0014173517714469036 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.19529164267883087, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0010857096664964741 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.3234464772574325, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012239406876133328 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.3815922019580204, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.001958691690626948 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.34319575393248214, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0012177223555536177 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.3387500414303267, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018088042362394187 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.3884662275886484, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0017571209037241864 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3549101277176399, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001508528726933971 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7fab042a1b806df03dc1a149777d30232b5c9f40 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 11.497966951726749, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20969517538787533 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", 
+ "rouge1_precision": 0.5632434121689073, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0035151496739973145 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4271980530284272, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003193464490019439 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4604743115551077, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002650137715023285 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.2697550180919695, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002821745940147497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.20055621808744717, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022326904923778744 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.21675097190154122, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002130665758231791 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4118615721075579, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0031358370899496947 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.3089968540225795, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025812304907704814 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3340759170486623, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0022699367889290594 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.4622969291805104, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003380992090862591 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.3487092046094382, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002878950898546305 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.3766682772648986, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002537216273666152 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_2.json 
b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..fe7966f8653dbf740202506c0990ccc0e1a9707c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 13.46002251789312, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.17779419634920132 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5942518719864448, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032186316753523097 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4643183570824007, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029744455358002987 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.4956710429402688, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002287948952720807 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.296024603420829, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002794307645016215 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.22746918536800087, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002276601310067811 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.24294338474194113, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002102733770788168 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4375809918999564, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030519076981421716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.33805697661700573, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002492306655427877 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3620142334272749, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002119015868595214 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.49322571419356365, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032440445435041525 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.38313830841766866, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027640726437621606 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.40982719923359884, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00233324427012507 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..03c806de64f59bb7e0e4cca175e09680c2ea8a45 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 14.228668386742754, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.15741694271607098 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.5984788690378352, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003197560184234838 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4725742646775533, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029898036325022134 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.5030401775192643, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022871900658989718 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.3022078979155364, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002768059681464576 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.23634888463371143, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002364216942607757 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.25125664699332345, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021594431913264894 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4402691357265774, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030458831253925003 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.34438265485639696, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025298846029330625 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.36768240781122236, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021719131443325925 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.49682458630116, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeLsum_precision_stderr": 0.003197442546788019 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.391344468013188, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0028109879310869203 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.4169702193831535, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00236830852153733 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8b4a82f5efe1001328d721df7bd6a58cb4a0ebf5 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_coherent_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 14.597195611135092, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.19311940839672206 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.6033209977447119, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032262604730037233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.4748762815970977, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002945258001815744 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.5067540268822891, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002283857469804599 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.30663914951234, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028252371140640954 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.23823746120517744, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023749274600023823 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.2543335770489735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00220496267618016 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.4445418586714487, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003042523823125125 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.34796937168289555, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002576869804389209 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.3718070181457613, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0022081793332601567 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.5009193423101975, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032015130321828705 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.3942571507375127, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0028270235680156602 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.42072079174732374, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002392828914020526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0b95e17d83e6e1d6556539cad80f6c32b1a312ae --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 1.4830980923039687, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.021118997157981167 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.16033400703017334, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0026324818883430674 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.26969890616718895, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00300612512619589 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.19411260589173338, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0026931921934754146 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.04108732076700223, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013148092674986009 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.060179100405235525, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 
0.0018083443739043902 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.0471164796345766, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014453353118279637 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.13141266843169974, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0016840231738956485 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.23233998433211528, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002079592569134541 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.1623047778451774, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001733234289363297 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.12413304341622819, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002193995817408237 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.20741850668045922, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025905538539559496 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.1497420581979396, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002264568864352182 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_1.json new file mode 100644 index 0000000000000000000000000000000000000000..afaa1844e67416ac7ee6ea53ee538602c1fc581e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 11.052875734645225, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14880854986446493 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.57014937693693, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0034678949077726964 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.42698229782553804, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003175445550377141 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + 
"rouge1_fmeasure": 0.46240711895501474, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0026000155597878864 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.2687730297236951, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027746644235063494 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.1978323547353253, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022114672007333223 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.21455453622601164, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020999880904874935 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.41381444937572104, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003104864621244597 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.30576495763203765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025267141814989897 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.33247489589579843, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002211720556331402 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.46455549393143064, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0033363704023788546 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.3457433020096904, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0028302467398599804 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.3752913090337666, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024768066693497694 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_2.json new file mode 100644 index 0000000000000000000000000000000000000000..37bfb707e0583ad164cdd3ee70fd321ede922168 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 12.991357667842433, + "dataset_path": "e2e_nlg_cleaned", + 
"dataset_name": null, + "subset": null, + "bleu_stderr": 0.2148940573425749 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.5896158988648285, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003227466696309816 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.45591912216432784, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002945187087829095 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.48878419234716897, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002280346300880379 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.291483506564891, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028136818980263323 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.22065139023543603, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00223563856964428 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.23698600371386896, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002081490451850364 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.43257854015901653, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030534619486731036 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3305654008256819, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024648687370697397 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.3555024478259199, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020958132366617455 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.4864701775008372, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032498731210847303 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.3735813004719031, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027186299763853495 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.4013706220535876, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002300602644643578 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + 
"bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f6e1001072f665565a6c021eb203c716d8fd4c34 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 13.762369274263193, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1239176716959734 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.59429608818623, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032250742001364 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.4665961665554913, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002964182169864703 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.4972877840918587, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022666769876385283 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.29729315956239166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002800668893654478 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2299667816072873, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023293965476642994 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.2450997873827247, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021372854101083423 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.43693224172608097, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030744702439597735 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3396715433163892, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025008213475653076 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.36309320543932994, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002143455970205985 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.49167275767919666, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003227991912008724 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.38503302211525176, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027832705857527332 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"create_text_for_me", + "rougeLsum_fmeasure": 0.41071083444612827, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002345494497419325 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_4.json new file mode 100644 index 0000000000000000000000000000000000000000..acab81e93ff541452618f93255b058d7c2a7e8eb --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_create_text_for_me_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 14.428600064889515, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.15433340933184303 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.5943537947777199, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031999293554169036 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.47228487122340007, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002956708038848888 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.5025477379055636, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023113153921818035 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.2998475825029556, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002791055852445239 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2352088394682515, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002377332758985399 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.2504150328382754, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0022100011211586643 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.43594043199679045, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029783466717854096 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.3443040962007768, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002545103701911904 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.366995310341045, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": 
null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021887174581775594 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.4931294642744763, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003159722931878829 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.3914753200381526, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002813609189039384 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.41671475645067774, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023914867754907575 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..620ff7458156db5fcfdf55b5bc925b0a24e1c20f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 0.0005117932625447547, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 9.536197427966122e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.07240788007030449, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004022247456753345 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.018253220521184315, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0009033239887635698 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.022954814700845268, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001068325564862084 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.010383184176156, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0008601583355918767 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.005509807793947819, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0003484284415338221 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 
0.006565521451443904, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00039997706182641485 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.07151594361458591, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003994235701988043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.0178020630508808, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0008766434342585233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.02242271358700061, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001039923095268228 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.06627554096777186, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0039461603525234356 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.014308412178016428, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0007253437526843341 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.01827124494666003, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008646161286213295 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2bd762e70a9b4cc41f67890cf12e6d4904249831 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 4.721645081870147, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11055575784993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.2724718400705274, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004738978458220745 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.22483884027045697, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003941300807919795 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.22193867861495062, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0036201064747419113 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.1110669378591428, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002415691980156218 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.0998953071293873, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021186084361931204 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.09765121722587221, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001961865183877688 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.21699291071030072, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003963831108261732 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.17178828430922233, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0029026558236146843 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.17063146146283567, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0026520595762520027 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.23219225461340648, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004261782115439799 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.18734037977237278, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003306476925594047 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.18546274919682199, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0030526937038671213 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5e238b54501a88b6ac7ee45896b76882b8b645f6 --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 7.330361926611788, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1393713297511099 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.397688884843428, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005243232044302793 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.28953345540286196, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004270611999067108 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.29951823566096936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003943331079304518 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.19106018811512912, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.003494466457298735 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.1386335472956288, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002514572800352678 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.1420081261904641, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002358304787932758 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.30864554668132316, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.004308343244325185 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.21603676807123937, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0031178212713556404 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.22492458878541202, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002857976437028613 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.33293896422770386, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004658899774399506 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.2396948856097552, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0036140816121463643 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.24818144922627508, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rougeLsum_fmeasure_stderr": 0.003358374241363087 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..464f0ef8236629689cac93a2f06789b392573115 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 8.582824417171965, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.15758544085957477 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.44237363313007766, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005197770565781296 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.3256644422872396, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0043344718084630095 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.33914919408530725, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00398888812190965 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.2164451576020569, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0035403319127956744 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.15653074707623968, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0025939385453826597 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.16128257662655165, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002432579571236265 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.3325732643672117, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00412359018355362 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.23863001496343497, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003169633829427399 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 
0.24925358955339996, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0028737162375128585 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.36169334864492975, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004501482660071617 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.2656809328179925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0036603625436697033 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.27632455232693603, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003375878671989277 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..daa4c06e3502df5ca18f5eea5ef9a53b14943e44 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_gramatically_correct_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 10.418299373383455, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.143116835758759 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.47588547581116447, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005023552603812295 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.3592386557163939, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004267887970496025 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.37529427535129467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003972066091031049 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.2368115010419087, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0035477130748788645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.1755184404072541, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0026501024504753078 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.18235846045485526, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0025546984319096866 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.35325726671965807, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.004022741316670696 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.2608057046678188, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003158960957102914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.2731858449063431, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0029261481157958646 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.3918934290122221, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.004437483351553158 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.29433296219990573, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003646588910109909 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.3077556604700645, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0034362468382048103 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..033b06edbd3d76e2b5c32ec30c7a128afea6298f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 3.1932091730586953, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12204583279523666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.41337918705325344, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0052760059693995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.2309238228257254, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge1_recall_stderr": 0.0032774148724919335 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.24046689564492205, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0026584396733124543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.23463707329236097, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0062472430372265 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.07811161999290202, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001551040325003802 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.08042598706024275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001263671869367697 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3599428455550758, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.005381765248981534 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.179931821205033, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002310618989620828 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.18994590486023305, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0016519612628498022 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.39508222499421797, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.005341940324256954 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.21269354787389044, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003034851683547011 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2231487737804936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024418581547524815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..41661b49f7324a69e36634ee50945205a7e076b1 --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 12.375900606698423, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.19606823444326352 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.597454575764657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032329061443020345 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4434643604110928, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029889149083277117 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.48293554827595836, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023435879912405283 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2910060712314691, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027476178598673066 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.21176179817940102, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021589328308703566 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.23118450232385498, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020415023824907675 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43888436474875336, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030457086517011057 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3220136107735307, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024545810076108667 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3518994031108773, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021201781748616372 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4908609045585646, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003223896911814539 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3626278022836946, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027433883081054507 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.39555654566655024, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023470162024293896 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8a102562da98dff96a2b2a73c2d97472a204b453 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.369325725727359, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16247647633686452 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.6089651864273457, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003198435500725766 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4684512148933178, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029309075224427496 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5050896808334919, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022820908410676275 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.31006692351264853, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002823237008286986 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23489561514574075, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022593518873055855 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.25360971138111243, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021297710933538033 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4528063450828851, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030647195241319818 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34593233406974633, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025245059893910897 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3737642937216708, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002190131174828545 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + 
"rougeLsum_precision": 0.5104756682075146, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032393352108801977 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.39199556131381524, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027868585721285742 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4229068329404585, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023792336336982954 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d22b52b1565deb2404f714b9ebc6f0d166d84aa7 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.229236548887188, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1893298480282073 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.6019638812498068, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003154041355931588 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4769354854853059, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002958348996182155 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5094328850058195, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023113991646470547 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.30884118560646173, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002746739827702377 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24294392646302612, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002367727486010716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2589859331129998, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021744147637838404 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 
0.4461308937388599, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029602114104730864 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3521903496319005, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002557452362321216 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3764193148894467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002186561194132359 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.5062370391801149, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003164925596063054 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4014335787816741, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002844420919383205 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.428648276943535, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00241888010522652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e64f6299ae2f22ace82f856b5b9ace8fb0d4a246 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.5976388656912, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.21552554445196853 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5995295914989719, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003160879006436628 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.48298984835020653, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002984598478781118 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5121277417214488, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023435839886818115 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.30839025650068297, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002750823115926885 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24686659194192842, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0024081433509198598 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2613413893640116, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0022244910107010367 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4436724332909479, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029870463284766544 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35568244613898303, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0026122906695223214 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.37763811148528253, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002256869338791484 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.5052162393739371, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031736753796748323 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4072510388482175, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0029018213850476 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.43177980537098043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024732350477868438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_0.json new file mode 100644 index 0000000000000000000000000000000000000000..42d342aa3027ab9578b78f9dc8bdcc1322f8ec48 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 3.5945698535151527, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05015613229197462 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.1518335745611247, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0018826786001323694 
+ }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.3204231881776139, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003749517143648088 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.2014677219582868, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00236715942636543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.06474667748021277, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.000965958278788391 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.14330865656477818, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021929123057498187 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.087018168845551, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0012616678095828638 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.13008693019278264, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0015822109164815863 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.27854183494329476, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003339816956023396 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.17348222244786518, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020292638348797455 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.13205681807979777, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0016948884637677832 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.27959042918923643, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0034135086071623226 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.17531663258612987, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021346250443011615 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ebd90aa9340d6fb1c2f7d549ce0b318a2b6ffc90 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 11.965870070374333, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16483121107829488 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.5887118747238361, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0034037823001644547 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.42863531404813354, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002945871261238008 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.46931614928272436, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023464967015723428 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.2917732877577307, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0029515576898139703 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.20650048693459347, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002171478739145782 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.22727215524670805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020921315305935666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.43843638462977585, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003267387355090894 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.31435132059874576, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002449056457480189 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.34594037793676896, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021730860064039063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.4863382057285638, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003427467174859405 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.3514705730796979, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002706234060494214 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.3858219923088528, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023631470234993725 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No 
newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_2.json new file mode 100644 index 0000000000000000000000000000000000000000..00c280ec3507101fd72a805aca887e0ff76c0aef --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 13.21968762329751, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.19753671493984845 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.6001939642148559, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032889319145619654 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.45215722065445574, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029376225460110163 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4896336928218898, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002272681148009135 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.3042584905990978, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0029124035723080053 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.22448908642607843, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022714538946666265 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.24356704396149256, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021235805300181703 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.44603166342337, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0031685568327235944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3328512396680296, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025185492257657357 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.36147096558421893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021784713573052518 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.49791643563111876, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003299490210633222 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.37373958585789946, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002758682539439094 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.405169085603143, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002336476952595647 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_3.json new file mode 100644 index 0000000000000000000000000000000000000000..37272f652dc9ec7f7a40a34e4313131250c31e77 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 13.762757946910973, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.15395787830101001 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.6011962101235646, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003243407906957494 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.4566959902853601, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002939867346407233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.49443903206008083, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002288175378374436 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.30545692484615405, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002828597032420292 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.22960389174017176, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023569729423993362 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.24825098805394827, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002175994450385002 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.4482976384673195, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030983880866779887 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3382220596725317, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025420156167704428 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3668711184434305, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021942133007791824 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.49895735281420034, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003239442063627894 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.3789157003201587, + 
"dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027965890193868393 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.4101796683834667, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002376314535157212 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8790723c1f81ac2483fe862f88fdb25f0456c55d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_e2e_nlg_cleaned_text_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 14.08932220330059, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12549599830246483 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.6032747175760897, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032330932608095417 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.46029173412458885, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002961686972053866 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4977678289874263, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002315197828946516 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.30785136134317653, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002851909271826754 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.23142634215783778, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023233772081180178 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.250383581684577, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021703487973632127 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.44843896379060744, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00309459957521781 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.3397342484556464, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00254503160734854 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.3680698318578711, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "rougeL_fmeasure_stderr": 0.002203713649900909 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.5007987352925727, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032114979248749087 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.38188336871065476, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002800647830926674 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.41293672055666814, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023792263326761756 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a794b314128b91b0219c392bf1a341b9b8df1b23 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.1071307957238157, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0016517097059310102 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.26802963730298424, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0038166991596965635 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.15126799942015656, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0022274714202831503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.016455460229799202, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007123063350781688 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.04241402980618847, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001839872857682253 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.02341459296206923, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0010037294271937876 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.0851420400256054, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011652272980718196 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.214378710831144, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0027666678497373413 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.12042858816848816, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0015747379656493184 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.08583240847633615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0013027504044229373 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.21687477112312012, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00316229908245589 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.12150553920325693, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0017784859933383196 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.7808151339872366, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09382381387908688 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..abbdf87f0535c6df6ef27f3bf0f4be57629df3d5 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.11231051130110176, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018725155871750657 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.1795623731554383, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0033273148415623524 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.12864627025251077, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0019671741638819457 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.007601036529870395, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0005382965250024377 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.01463614027423228, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0011353174654812333 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.009363085787250576, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.000657318457701902 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.08529490041926313, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014199778984168093 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.13479472021332767, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0023922057175442494 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.09703185675955185, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0014143082322830222 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.08981383040850886, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014661369138302548 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.14526601892808536, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0027443402153164392 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.10322737003200926, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0015575341100461024 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.44746058261891863, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.04604614675806431 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..72ce1b2f9ff34bbea908046ec34e43edf0c51794 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.16788705210345736, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0035631862700196597 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 
0.1859715150205194, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003718461517457391 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.1630833669947709, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029013541077108885 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.027280423968171305, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0017884680404872415 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.02861726878304468, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0016227282457088239 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.02516880091161777, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014518773666627352 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.1298654453376493, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0027792255251731595 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.14241015540538696, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002737234265271257 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.125424737042503, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021634801655348837 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.1317729164353648, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0027919825527681222 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.14675010657719567, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0030004020311325185 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.12801618488485492, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002241267571048609 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 1.2338766623936686, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0863575218315581 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json 
b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e38a1ec75f0ca73a406e78047dfa81cca3a0030e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.20259935374214025, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004297260212942113 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.19902344092534727, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0040898284395801425 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.18680534135299007, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0035041138182561496 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.04047153603808275, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0022111947670748746 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.03870933901712769, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001945297453974642 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.03634684076285987, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0018252272028769054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.15363058299522142, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0033796539988497055 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.14959751674894153, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0030580637340826234 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.14103166922609478, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0027058623767703674 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.15548750806102077, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0033775890032156424 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.15343744602601786, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003228331954516354 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.14344562299193356, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0027398053676387112 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 2.112211471327635, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "bleu_stderr": 0.14836531376817008 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7642c9b292e319815c1486c6cf2f6523828f3725 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.06203252815051904, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004046315146705861 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.05167378825092661, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0033958009117904748 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.051670093376308644, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00325478898460509 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.01609711772190667, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0018101320874714744 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.012532422565499546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0012529251267989366 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.012790849399041708, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0012580759880557381 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.04843983623971503, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0033335322774442353 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.03883146936809033, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0025763740768868872 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.039365007752481786, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0025465835991996433 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.048932832240163246, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeLsum_precision_stderr": 0.0033502554817490303 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.039682203568842785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002636894081082925 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.039952964885033464, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002574589625833568 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.08776375172639635, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0254874930206879 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..eff9e806ad5d7d29487a8edec621c182d047d6b6 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.0017152658662092624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001212355466087544 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 7.94438927507448e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 5.667175161318954e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.00015173505739543477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00010815684401450718 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.0017152658662092624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001212355466087544 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 
7.94438927507448e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 5.667175161318954e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.00015173505739543477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00010815684401450718 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.0017152658662092624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001212355466087544 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 7.94438927507448e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 5.667175161318954e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.00015173505739543477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00010815684401450718 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b9bb50252c2747a28ae4216dfc4a6bb0704d9574 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.149753853048109, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002171727132004842 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.33575386280850533, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004569585612811194 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.20327156203276772, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002713519692487557 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0373304252845041, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0013140461748437202 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.08667444780169199, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029020162558933056 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.05115437745821428, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rouge2_fmeasure_stderr": 0.0016948540468750808 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.11684544594863705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017379797448369813 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.2626066143615487, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035996707108239072 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.15860444913174343, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021216399101338006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.11959497456732406, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0019073480015924227 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.26906161142003515, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004008001098551392 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.16243069564873555, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023749553691038666 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 2.0038553875789904, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0955856896174526 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eb7b747c993f47735992c944cd6aae32ebc91dd2 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.17568310283334154, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003154566926090178 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.2735517968470343, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004496187884537725 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.1978014293145674, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029030615223376317 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.03589501457929329, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0015153906885206705 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.05948047287527727, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002405412603446267 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.04146006996703184, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016153060489392866 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.13012656454111257, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002376680785916719 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.2033977431475975, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034053368336631642 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.14648095189504623, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021711694446675286 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.13565628042823968, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002405886992298969 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.2159357251685773, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003778783915859853 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.15404254885643145, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023143063162019595 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 1.8050866450785243, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10998069507650922 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_2.json new file mode 100644 index 0000000000000000000000000000000000000000..33b658c6c40db6df223f45ed6234d5a490ebced7 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.2771488326124715, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004467706670226065 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.25485033292705744, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003967723186833506 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.2502568815425181, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0035630016377887442 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + 
"rouge2_precision": 0.07229011551366707, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002865914324384087 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.06603510981541624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025917576128546044 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.06491562652621455, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0024653642015722986 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.21436486172237043, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0037773047502249776 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.19611922774785026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032881375175458053 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.1929431469393036, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0030168487926143113 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.215602404940091, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0037769923822126854 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.19818717656448856, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003371545970704401 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.19440288733566372, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0030350725691875064 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 3.37468537370651, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.23333263171377386 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_3.json new file mode 100644 index 0000000000000000000000000000000000000000..34cf0681b169b5b252ec2981db413ed1233af857 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.27792934145157255, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004849829964193726 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.23766382636060715, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003915272947542149 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.244573711894895, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003835454456407674 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.07564028912702611, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0031166559518488166 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.06284434238677651, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024326471023516597 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.06539576616966959, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002549322979655533 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.21520151898487322, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.004159378136303307 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.1832103735419546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003250379217526897 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.18891811330851163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0032841796559050377 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.2158020371347959, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.004149734653819935 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.18401121788190536, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003258676196585185 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.1895940498397185, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.003283363186227225 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 3.440764901759586, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1784268606441582 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8774367e119d5dc16adf86d73bbf995231731959 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.07246383175431523, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 
0.00469786089881535 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.05629931108498489, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003640521050340188 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.058787434685230615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0036645533335454812 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.01903215203677445, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0018590330603459604 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.015828376332120486, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015323609301644817 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.016318581481069933, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015664988644061195 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.05759037860265505, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00393018249775178 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.043864628765744866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0028934618336297607 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.045917438807934134, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002931975565773022 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.058060643012514464, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003941612774228265 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.04443074379368858, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002927987421647252 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.046389176010339, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0029498199662157673 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 0.07086985461456602, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.02998805114415839 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f2b780deb90c7abb6ee7ee0476a8eaae47937ff3 --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_DOC_tldr_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6481ef94ac7568d20e012f1be6d1cd4784931f7a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", 
+ "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.13999986827582567, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0022474794503698063 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3146848228074294, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004975343003506539 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19062853936311658, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029532646040511964 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03163976089485668, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011861306107240704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.07306747424744006, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002675800285732193 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04328828501096498, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001566248398707883 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10267416140662887, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016806410262001903 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.23218112607700236, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003798988027549183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.139911595168302, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021985427116768684 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11113494233445374, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018618965914790858 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2517399727563149, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004226741179316262 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15161941662067227, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024594718320593506 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7872267963517905, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10045358377651255 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file 
diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d6f06abc33aa6ceb75bf2ef1df7cef7475885a8e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.198389498815838, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0036200528310182637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.27297271430803394, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004198789938626886 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21134911198456563, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003029508784148872 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.043045220973661015, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0018919242792135235 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0597065055598483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023731544903471393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04534620252044498, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017858055311589297 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.15080384564385396, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0028920115769217117 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2086058984013953, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033970509765821025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16064653466424578, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0024169691348135945 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.15275988244015698, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0028982643948039046 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.21320242569675277, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035810539802842534 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1633349245997364, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002468028232419068 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.0535765112268263, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11972597685190434 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c2d8473683aab30c50f3de98e835090cab3b563e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.2607761013228957, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004351702830837392 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2607885042839857, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004018215060642568 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2444857696168434, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0034588270505411917 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.06344356630530605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0026820157377482148 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06209709779478734, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00246191601941908 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05852877798479666, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0023072565365240597 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.19907633090246687, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0036157515137942723 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.19814987180436297, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003287204314003516 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.18609020665428283, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002889852915757798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.20074551849299913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0036053665449958016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.20155142761693595, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003414189551327218 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + 
"rougeLsum_fmeasure": 0.18820566058762211, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002901279776307625 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.0737243948901085, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.20707696514444188 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b7f0128806ec27513f562bb5c568e99d368a7a5d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.26517071973676093, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004671054222681604 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.24180770628858178, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003940301667285207 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.23929443585701224, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0037168757909286886 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.06573984745309881, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0027796960958526426 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.05833153123977987, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023156387603366626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05849869572638163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0023240414297435643 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.203598693888488, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0039976969321419975 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.18373116025857217, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003225733424944883 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1821366529554913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0030775956513009466 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.20485233666950228, 
+ "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003995574052859589 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.1854022097149016, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003252182345953945 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1835081526249607, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.003085655347849612 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.3049815114104333, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.159904818084904 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ea41e9a9a5900b8c99abf372a4015cc3186a1c25 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.07320382597863412, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004382339503654509 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.06154298544226265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0037288317198754368 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.06274888299344217, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00368347747261438 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.018885269050898535, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0018067685701354235 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.016055891153780352, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015325179921096511 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.01636894923247962, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015408484981787023 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.057742538854948004, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003573097680574354 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0479152284127263, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0029805271711328637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04906837355261336, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002968163725283441 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.05793234813776215, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0035819297580720103 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.04803187451993844, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0029874284500312834 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04921620084471268, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0029754368968071495 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.14790698979130756, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.04384286085154373 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..26689e0d955ed00b9a3ce86d918ac82147bdcd9d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002313901471635568, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0006326582061365616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0020282790639805184, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005520696211086723 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0021324501644318945, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0005815454829014453 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0004152380160032476, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00018397799224345848 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00038092195139919884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00017352719125781168 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0003960521179313103, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00017805195394425088 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0019597074784293666, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005429844277966419 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0017448307905115785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00048409378403815574 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0018203546217228452, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005047529402091446 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0020637585328133834, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005760004715550667 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0018460791228919861, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005185190358660706 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0019229642762192925, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005389418518153393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.0193423208210636e-40, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.482688743257922e-35 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_0.json new file mode 100644 index 0000000000000000000000000000000000000000..efbfa8acab78c86f914c2af0996e230a3f2bb6a3 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.14012944937421437, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001815307424647061 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.33928055295031845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004205581767652354 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.19588864333192735, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024295469670554144 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.028568538737182608, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge2_precision_stderr": 0.0010066612852665242 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0723006950405151, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025924457641700845 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04039396274671048, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014127466271823398 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.1022109965196248, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0012997576864037014 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.24906820252984618, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003169806335614786 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.14305945497627331, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017548887014810013 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.11145031743937453, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014910950299529468 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.27157007058633853, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036049451066097176 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.15603837919866542, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020168926814176018 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.535130559814857, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08809834580896415 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_1.json new file mode 100644 index 0000000000000000000000000000000000000000..02ff2cce9db45aed1a82900cdf98ec01bccfb689 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.1950575300209308, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0036023058150002466 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.2922289142961424, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004398147986435147 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + 
"rouge1_fmeasure": 0.21339819130609702, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002983888004350639 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.04216057616807481, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0018970859687410643 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06585825426835108, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002477991546098299 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.04676534492086212, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0018057298964850564 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.1465833707685466, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002875377511839599 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.2195610151275998, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033993217008700853 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.15996882237844728, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0023436116694455617 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.15094912938206448, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0028937977529524183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.22997442326868872, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003759859465696649 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.16604704688504865, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024674421504619097 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 1.9675951573060544, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10694227812444064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ad526e53bfbc7007c581976837f098aeef2952ab --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.2638394192805916, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge1_precision_stderr": 0.004540435287238375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.2738037170319721, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004200587916428268 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.24732615209837655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0034733941519721734 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.07001609054360647, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0029271091177120886 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06909930052758312, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025744304924619835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.06386379302307044, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0024181690250233505 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.20450354273425303, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00395614496467891 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.20875565805339058, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003359958014590378 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.1900422633935571, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0030029862330431846 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.2073880639894331, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003930522367662495 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.2151833283968645, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036280670247936853 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.19389966865847785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.003031622262926847 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 3.0346172155561546, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.15566119939690068 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..c428dc789b1fe66a251e7a4a6b4370ba5b66a671 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.26034346215610965, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004714473900591052 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.24906248646319382, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00415456465298038 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.23780963877739292, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0037395882283707015 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.06660770770868331, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0028822154036991523 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.06179338888003384, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023891674540167077 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.05977234078723167, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0023701821865365984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.20086262918393394, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.004038583972504473 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.18959349271627773, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033254851364105283 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.1819474141212443, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0031519157034204308 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.20259177495104444, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00403113378580041 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.19316090690700008, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003503292832588365 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.18412012740680855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.003175557497433766 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 3.425765641925955, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1979335091537913 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + 
"num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_4.json new file mode 100644 index 0000000000000000000000000000000000000000..89d4a09b29ce08345c6373d1aeaea1cf81695766 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.07114285672279302, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004380915271764054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.061764569030446005, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0037292099495240716 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.06088055060364192, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003575574283828377 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.01884955198069114, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002033397598483995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.015445509919559106, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014952091434256999 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.015399984554066971, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015076205121051984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.05564721298035322, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0035475780203014515 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.047419217528694754, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002898877332795932 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.04700874906652635, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0028312951452581186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.056405015194697566, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0035709061865885116 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.04839142944384069, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0029806554750605547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.04779427597038301, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0028660149390703524 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 0.2087867261327827, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 
0.06562705889970392 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a279042f7a0d9f569cd604c051aabcc95312b658 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_DOC_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.0023440923719853565, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0006865168187194257 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.0018566841831715792, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005180019415016593 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.0020309670290739046, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0005791928983528292 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.0001917762530970078, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00011735823792007502 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.00015886629094176265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 9.83557528468589e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.00017372564542375864, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00010697841454925956 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.0018456793659054301, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005204179651067365 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.0014866978138197782, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0004055688639589803 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.0016117708575998733, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00044548916776186037 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.0018005407904788703, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0004985423695356434 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.0014571242644023771, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0003936140639568936 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.0015760361520538472, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0004295342382164876 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 5.108639972341924e-39, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 3.956752962357288e-33 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4e20f083b8444c7e6ce542aff3d3655c15b68a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.1474899344681375, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001900079131590382 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.3534023285854842, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0042808487059165885 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.20565685899580208, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025302238335519456 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.03327897686130466, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010857124576225867 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.08257504981830704, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027006296618498618 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04683864411040331, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015138269812197199 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.11026909240204424, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013894987564095334 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.26619441069312516, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003270460032424841 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.15403195663828284, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001865355995836426 + 
}, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.1173973677189778, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001580655687968673 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.283327063520947, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003708823027951565 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.16403204921526296, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002134168071742254 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.8347869480000898, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0869883864663994 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5973ab2dbcb289585bbf3d51c154e07481969a63 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.19566499021136027, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0035669769380662765 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.27331585723108776, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004448638835604697 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.21005056026385502, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0031333342568339744 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.04186277589901304, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0019629776118442544 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.058998681138224096, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024069269140667474 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.04475960677022685, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0018872319158043016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.14767848173331127, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rougeL_precision_stderr": 0.0029216531952624223 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2044653904041559, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033809171614435102 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.1575307148373402, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0024757681091360194 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.15272574897907007, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002932780684799666 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2157182395445939, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003746152427089657 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.16437594892952237, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002587213479835073 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 1.938071033295324, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09541184002040398 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eecaed06ab786fecb0f41f409e7b24099416a06f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.2506617619333507, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004279548367868259 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.2771185146802034, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004038617755545862 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.2448482219235882, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0034178077943435277 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.06019009652828162, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00257094644990412 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 
0.06367536344844839, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0024323779221596563 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.057573246775443634, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0022648000197355543 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.18921156917814427, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003569423509420866 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.2072997416186527, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003206717340734459 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.18384384732035777, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0028420337598510335 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.19253397732901711, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0035416958117886277 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.2142839788487451, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0034386239695474597 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.1881941197455998, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0028677725202235333 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 2.775277917992019, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.16055336437381154 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9a91ee39a2f80b1003e818698f2d00c1cc2e6f1e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.2490418965952657, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00477783797042605 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.2514874020658644, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004242277195358792 + }, + { + "task_name": "gem_xsum", + 
"prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.2328552562831893, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0037312986124307416 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.06439914737784772, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002947420000073468 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.06084688838603643, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023670005477312694 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.05770154905937547, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002284739031825248 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.1902696505668648, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.004048113449529588 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.189296062302524, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033439414911562394 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.17641797157296718, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.003071228472405363 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.19325864739656615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.004019456619493295 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.19548749971046633, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035310682553972364 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.18026475633170685, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00308320691761892 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 3.0847462253212723, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.2223901385109968 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d4efb6fec1fd47508c2f7c2cc3aa2ca5e9f56f35 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_4.json @@ -0,0 
+1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.06149614061062767, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004002014655670965 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.05939632565343678, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00385700196779865 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.055469206946848856, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0034842031118360157 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.016873072153203366, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001726552437885677 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.015979494842853718, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001568928538748884 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.015079805978761029, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014697982090914057 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.04847125659743045, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003240222188539927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.046087232274086315, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003078628538409018 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.043274129761389696, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00280337313824995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.0491128410938864, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003254043057422175 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.04736823871678665, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0031650848308666943 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.0441116426489953, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0028366835005017984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.1693742801410804, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.04377385517948794 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..95cc15f0e5adc3bdc7120fb7d7e5d0059ce1b667 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.0035734705546026303, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001375063651136652 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.0005441463597927607, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00021036208677550778 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.0009197860202658167, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0003511163847714274 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.0004288164665523156, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0004288164665523165 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 2.858776443682104e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 2.8587764436821168e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 5.360205831903945e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 5.3602058319039565e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.0032875929102344197, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0012833920426189107 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.0004905443014737213, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00018887263781695456 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.0008295088694126976, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0003145590430095732 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.0032875929102344197, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012833920426189107 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.0004905443014737213, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00018887263781695456 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.0008295088694126976, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0003145590430095732 
+ }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7a7e5e03bd7fcd26ce5c6d45614582014d488e41 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 6.278734834615668, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.2227369064421852 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.08160371186471228, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002374549489609363 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7004950818659318, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.007036802366632382 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.13373991461055282, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0030603070260723855 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.06495513323593309, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0021769496767186734 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.547686255496274, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008191322897272223 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.10747014085856893, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0030428865011832523 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.08082169564914758, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0023411236390164757 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6966925221944833, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007073853726693974 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.1327187453488244, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.003043515110100461 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.07932773548315011, + "dataset_path": "piqa", + "dataset_name": 
null, + "subset": null, + "rougeLsum_precision_stderr": 0.002353587483256341 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6812084956321673, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00725244402825127 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.1299744874996082, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0030517622714577117 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cbe273c891b524aa19cba26fbc62bc6c5d8d2b55 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 60.18185050682653, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.1724481335740067 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.6786023985669906, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006879636404319519 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.659172864309111, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.007228925418519524 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6518546551360388, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.007116978768503563 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.53388497210983, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.008002884678287275 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5268639680421803, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.008159634832380563 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5204293645702318, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.008052025950724288 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.6623660397033849, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0070499025881858735 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6470537171924869, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.007432342342313776 + }, + { + "task_name": "piqa", + 
"prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6391270001438089, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.007323388651758978 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.6652433097208901, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.007031373219079611 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6488361635192536, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.007399980576394991 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.641084851380832, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0072905490236244595 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1570fa5b99de158f3074cd38ad6d7e7ea2d3f0bc --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 64.64005573569813, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.1125840968414413 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7115721847853325, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006383049267455568 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.6898981524653733, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006769920097945114 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.6887808582968012, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006614099300805803 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.568469754070017, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0077553126930507785 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.556983036110333, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007909142169515245 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5558632920359011, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0078092241499811525 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.6970976003322448, + "dataset_path": "piqa", + 
"dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006591012707429238 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6783489276320744, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00698850669069673 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6768058280329439, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006840870337470356 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.6995156426369364, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006561376900903898 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.6801204068768889, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006951032244743205 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6786509336030443, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00680233159395632 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c77c4e096afa10c0317e36c096f5e914b7fda075 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 68.22165305419762, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.8890448430097019 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7221627149025683, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0062305282040743 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7108029019292199, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0064897327358770865 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.7073112938146661, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0063804858035251604 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5820508960354065, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0076299576959030615 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5773033129345385, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0077930736641026125 + }, + { + "task_name": 
"piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5744304096662205, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007699595228276496 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7087419186042014, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006458269024355693 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.6993109497667697, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006723713657261213 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.6956339607612617, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0066200787187141895 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.711133750119118, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006421232852010984 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7011411370750749, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006681507675650779 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.6975066700910248, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0065771947909297095 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..838f93bb051088f7ca72f77c8c8bde5e12275782 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 69.84524760921622, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.3023157448019556 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7345229851633986, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005986767051198255 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7253509489635235, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006227746850689511 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.7217534759948284, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006121513376796184 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.596040626875803, + "dataset_path": 
"piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007501433138942847 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5915933328294807, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007623825675456275 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5886766303319366, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007540513357647879 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7223498063566517, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006229876667011842 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7145458072279548, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006462844404701978 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.7108619060602632, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.006364463104987073 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7243706723600197, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006187955065543882 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.716189446557702, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006421987983352356 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.712522166886964, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006322893899736439 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..830401eab0c824f7dba9be6c08c07b32f974b0c0 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_Correct-the-solution_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 71.08648151603752, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 1.0423319434802825 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.7364977609872968, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.005944810805693522 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7306819844992705, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.006157866649224208 + }, + { + 
"task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.7261239196945068, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006051448617490669 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.5975955689445211, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007475842785462623 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5967441077322465, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007579782597891848 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.5928294767677813, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.007504735496121762 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.7246335948521524, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006182415774653073 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7201906425984315, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.006395020962183312 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.7154921729488967, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00629239525683029 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.7265795029105262, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0061418478208074 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.7218218633881486, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006352649666067237 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.7171276634610969, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0062501289605129545 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cb7c3a19409518fda518334eb5276c80f3bd1dd3 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.4896626768226333, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663330673075898 + }, + { + "task_name": "piqa", + "prompt_name": "choose the 
most appropriate solution", + "acc_norm": 0.4896626768226333, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663330673075898 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_1.json new file mode 100644 index 0000000000000000000000000000000000000000..badfcd1c87bae3df5fc43e5b7254d9e74f74f9de --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665713661738877 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5021762785636561, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665713661738877 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b59df6ada2d59a4d0242bd8f2b93570575ee361a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5201305767138193, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011656365410780372 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5201305767138193, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011656365410780372 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_3.json new file mode 100644 index 0000000000000000000000000000000000000000..005bd617c171cf9cd6c90f9e152841d1637628ba --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5244831338411317, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011651830225709979 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5244831338411317, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011651830225709979 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d93238a5647864de2d80863aac4fc57a389f729f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5081610446137106, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664270112244237 + }, + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc_norm": 0.5081610446137106, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664270112244237 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..44dbf17b170cca755fa4cdd971ecfdde0e100b07 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_choose-the-most-appropriate-solution_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "choose the most appropriate solution", + "acc": 0.5212187159956474, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011655314732288861 + }, + { + "task_name": "piqa", + "prompt_name": "choose the 
most appropriate solution", + "acc_norm": 0.5212187159956474, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011655314732288861 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_0.json new file mode 100644 index 0000000000000000000000000000000000000000..802d1b5d43308e28c12bc5b6f645a96783a3693d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.17066898481393913, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.00803261594895346 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.019988781791314362, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.000500033258601662 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.2189943530475805, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.004079283289986538 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.03478661571339285, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0008014750154673911 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0031299626827861727, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00015456359119012437 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.038130006762105165, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001999093625706957 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.0055069416857675855, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.000264484767130222 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.018169764912390914, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0004309343684239099 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.20275069372478832, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0037184478182369536 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.0316936420027784, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0006939152149686633 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.01648814395051073, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 
0.00041116448513992096 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.18839597007095757, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0037196421404839127 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.028758460694468576, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006600358478717193 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9b0dc7eed21ae9b635ca3553c9168810fb2f7a55 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.31915048013706, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.02620039827296871 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.08315699283905327, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003419242714843206 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.1670935715647722, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003999059624587211 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.07886064580447584, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0026219177447909005 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.018370065672256373, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0014347031808393837 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.03157703113403232, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0017958384553041604 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.016827951371953354, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0011812390790814646 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.0673356509458437, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002737988776916549 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.14655813236600299, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0035859866081953853 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.06542274863101005, + "dataset_path": "piqa", 
+ "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021692648679794255 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.07012972287523524, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002969174912446342 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.14278060144306878, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0035650821245661137 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.0661469228892784, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002260599148736579 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2cfef1430fec97335165593b0e0f6fd2015907fd --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.5038883821200564, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.0604617926818546 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.06764853867416688, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003565719899866043 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.0646429180266884, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003083663833624245 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.05198294358223645, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0025042512053083032 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.015459698428139546, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001643754856124599 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.014396007314664165, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0013434842111948982 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.011752668395362776, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0011342195396425946 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.05781587124982036, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003092606353558658 + }, + { + "task_name": "piqa", + "prompt_name": "no 
prompt needed", + "rougeL_recall": 0.05674010175258449, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0027470600077108814 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.0448509387316729, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021869950723516175 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.060347710618179395, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032401079418637705 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.05744446846929318, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027724301585786448 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.046043469207693984, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002239044296435396 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f12dde92a30049c736af8216543fa67ede8e5bd1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.417256690773128, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05417089218251962 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.07310603926070969, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003697200472520524 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.06537860586537475, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003132408607889633 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.055698333051118806, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0026311711420543356 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.01698481574652303, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0017805476081186934 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.015257654221563844, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0014614481898404892 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.012670702709971336, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + 
"rouge2_fmeasure_stderr": 0.0011548610533438693 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.06317321591157367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00326988264639926 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.05769516321706676, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0028323811792588284 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.04835453953831574, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0023265907591892786 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.06558154945562647, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0033791812198662656 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.05918648566874358, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0029013049659238066 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.04994572581381962, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002394485017637533 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_4.json new file mode 100644 index 0000000000000000000000000000000000000000..72506a2b9b74ca84ce3d610ab74c9fae8b1c0d29 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.5230628724174664, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07819793788240743 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.08730073095091566, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0039923132219365646 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.07777229717916978, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0033592902246069372 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.06832044026429208, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0029205130223798324 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.021287957435449874, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0018928999405201379 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 
0.01847896121529526, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0016529295281884947 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.01641917483304456, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001396907601907277 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.07479886960092183, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0034906398849194483 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.06818337939770107, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0030433206990172095 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.05888348175750054, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00257744328152746 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.07758866056449988, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0036373270512517343 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.06936506932934977, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0030738111400345396 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.060425478261301964, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0026348754382351006 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..63b07221b3a1346375b9441e7945e0c0d3f5c972 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_no-prompt-needed_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.6835675289713131, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09201535652694026 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.0970156487496315, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.004126042501770336 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.08622233259578811, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003503196213825024 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.07518330292771172, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0029704830489176897 + }, + { + 
"task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.022354315529872364, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0018287202748284283 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.018862588281324987, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0015455208203762356 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.017013197441592458, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001321185275390085 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.08321458322323663, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0036022749449818596 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.07523134647928398, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003155331153332636 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.06482059902196627, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0026255826270300616 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.08611431602639111, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0037595269875219453 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.07664158416334983, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0031935933997685262 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.06646799284350768, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0026802307096209666 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ee9c783466bd74d60f65ad4b11128106895b290b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01166526473007815 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.49510337323177367, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01166526473007815 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9da1d3c4d8b789ed48281c2509bb392b0799b286 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4940152339499456, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664988455853323 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4940152339499456, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664988455853323 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ded8947cde966f3e42782bc04f33e2b69ee645af --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5070729053318824, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011664656918145945 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5070729053318824, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011664656918145945 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3096aeab11603f3a290484c374ac017cbb58bcaa --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5228509249183896, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011653634832401168 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5228509249183896, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011653634832401168 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f38f0b0a09fe666b378a812fadab51c57dca3a65 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.5092491838955386, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011663828032649181 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.5092491838955386, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011663828032649181 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7a64416a0d54a298acc432cce6753bcae01ffe4e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_pick_correct_choice_index_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc": 0.4961915125136017, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011665485744746797 + }, + { + "task_name": "piqa", + "prompt_name": "pick_correct_choice_index", + "acc_norm": 0.4961915125136017, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011665485744746797 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9202c3e13c83dabd3c00631576c71adcf3a83598 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.558215451577802, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011586482494310218 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5603917301414582, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01158041724865657 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..03a81a41d70a9d11f031b69c1838320bf1706a71 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5522306855277476, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01160199979686681 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5527747551686616, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01160065944329292 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..78607b576c86e44b6dc2f4b0aacd671fa4b4f160 --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5500544069640914, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01160722083798011 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5495103373231773, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011608491028638188 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6345bd6c1c0eba6a6527eed21e96854639301060 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.543525571273123, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011621538875661537 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5424374319912949, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011623729421518137 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..81f56d2dcc2ee426426b525f2ea9d27259c688ca --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5478781284004353, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011612217507379627 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5478781284004353, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011612217507379627 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9a4ccc5bc84bc5ba472711043e74540761dc35de --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_piqa_what_is_the_correct_ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc": 0.5522306855277476, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01160199979686681 + }, + { + "task_name": "piqa", + "prompt_name": "what_is_the_correct_ending", + "acc_norm": 0.5516866158868335, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.011603326108334502 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..66d66b0a4da6bf29ebb275f99c8972017e419880 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.598, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015512467135715075 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.542, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015763390640483706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..58ddfe45ff20dca88c570eecc47c48bf17b0757f --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.662, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014965960710224475 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.645, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015139491543780529 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d5d31d0bbf9d80576c7b23ce7bd74e3142276d6f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.664, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014944140233795023 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.667, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014910846164229864 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fc7dbcdc55f67d1a8156522a26cc65913b11417d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.675, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014818724459095524 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.692, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01460648312734276 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1c3d6c7bd90be64f4366e4bfb29601e33896e6ef --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.687, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014671272822977886 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.701, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014484778521220468 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5a2a66c9edccd79a697f60a6cb5097259f6bf22e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc": 0.692, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01460648312734276 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question (Closed Book)", + "acc_norm": 0.691, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014619600977206494 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9836c57d8125abc01be1fe673d23033a7147f625 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_0.json @@ -0,0 +1,34 @@ +{ + "results": [ 
+ { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.855, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.011139977517890145 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.77, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.013314551335935948 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_1.json new file mode 100644 index 0000000000000000000000000000000000000000..88026bf321862bcc7b347208c3fb50f59b4f4b51 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.906, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.009233052000787733 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.897, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.00961683333969579 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d74eaddae795b7ac2dfe87615f720f5d6d2c3b6c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.913, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008916866630745894 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.91, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.009054390204866435 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_3.json 
b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c7f7fb1fe8fd1e5b0f2c504cb2a09030cd7c746f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.921, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008534156773333456 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.917, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008728527206074798 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_4.json new file mode 100644 index 0000000000000000000000000000000000000000..099c15cf8279d0f801354643e572f454e08e2921 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.922, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.008484573530118583 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.919, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008632121032139978 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8f4c0afb7472f4e586e663a0f6d0ae0953bf9e55 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Direct-Question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc": 0.92, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.00858333697775365 + }, + { + "task_name": "sciq", + "prompt_name": "Direct Question", + "acc_norm": 0.922, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.008484573530118587 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ed91e36f90f7b8f71b798fdec5075149107e6f45 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.342, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015008706182121731 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.362, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.0152048409129195 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_1.json new file mode 100644 index 0000000000000000000000000000000000000000..103d0b7f79bfa75f0acc01c8465a40eed83790b4 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.341, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014998131348402702 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.341, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014998131348402704 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5b91ff42708bd2d36c5fdff37969c4afa46167dc --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.335, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014933117490932577 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.337, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014955087918653605 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e7acae7f872a90b1b10ddbd7d0a8ccdb45f10e6a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.317, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014721675438880215 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.335, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014933117490932573 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_4.json new file mode 100644 index 0000000000000000000000000000000000000000..88c6a4462f1937cbd9a563d3f0dac50346f478e4 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.328, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014853842487270336 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.342, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015008706182121728 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_5.json new file mode 100644 index 0000000000000000000000000000000000000000..61377fc9da5af6a136267da1d8735ee068851078 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-(Closed-Book)_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc": 0.338, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01496596071022448 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice (Closed Book)", + "acc_norm": 0.334, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014922019523732956 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b3b91d95967e10e8d8ad9c4ec4e7210770fe45a4 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.329, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014865395385928364 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.342, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015008706182121731 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2cc88057bf905648597c01182a0ae50366bfe130 --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.304, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014553205687950436 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.318, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014734079309311901 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_2.json new file mode 100644 index 0000000000000000000000000000000000000000..71bd2ca54deadf532bfe69f7c591c0ebe52d6eab --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.294, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014414290540008213 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.311, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014645596385722695 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a52efa88cc67263f47d7a0232ef00c84b5b5b3cf --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.292, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014385511563477343 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.314, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01468399195108797 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a51a426906dfe9aa9f1fed26741e2c1d69265204 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.316, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014709193056057121 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.332, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01489959724281148 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..caa720cfe4bbf42b05d3123319a968e9fa25c23e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.328, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014853842487270333 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.327, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014842213153411237 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a8e12686cf8d13988ebc37b6fa97dad6ca510a74 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_0.json @@ -0,0 +1,34 @@ +{ + 
"results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.359, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015177264224798592 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.36, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015186527932040126 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_1.json new file mode 100644 index 0000000000000000000000000000000000000000..846ac0e7342de49f61d3e8134f8bc2c68c1c78c2 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.359, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015177264224798596 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.378, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015341165254026649 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_2.json new file mode 100644 index 0000000000000000000000000000000000000000..91db6365387ef90162194367e53c4df78c05c1bf --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.322, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.014782913600996664 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.333, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.014910846164229868 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_3.json new file mode 100644 index 0000000000000000000000000000000000000000..701e16952ca643d185845dda5d8923816dafa37c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.339, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01497675877162034 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.351, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015100563798316403 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c48f60a915270f4abc64371fce12f6d49b62fa83 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.346, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015050266127564441 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.358, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015167928865407557 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e695f437ab9d1b7cc1ff6dbc34240949fa37c465 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_sciq_Multiple-Choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.346, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015050266127564438 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.358, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015167928865407555 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a9d2c497911a91f6a21297f210bb9c63717c3405 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.5104222340994121, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011559920087347776 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.5248530197755211, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011548139823074772 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4105085d23b812c39c9a90f4272f29da9c6cf795 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4778193479422769, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011551049647290314 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4965259219668626, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562153149168287 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..9c6942ced4e45cdf70583c09ae465da863b76711 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.45911277391769106, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011523708060182086 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.47140566541956175, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011543509045585211 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..96bd5dcae01deb8b4390728e3d4979382450345b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4489577765900588, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011502027057558886 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.45323356493853556, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011511744771088355 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..de6c182c277e07c494c98116c7dc6db804aae386 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.45056119722073756, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011505771738769861 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.4452164617851416, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011492819519292359 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0a2931fc789b028fee5ac0bcf8bd24c67b531a08 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Answer-Given-options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc": 0.4409406734366649, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011481489309428048 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Answer Given options", + "acc_norm": 0.44468198824158206, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011491450380971893 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b71f7e348b975da51878e1bf69cccf64fd43cdbb --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.5104222340994121, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011559920087347778 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.5344735435595938, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01153491734135513 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..5f915932bef248a25cce502262caaf280f1b64ae --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4730090860502405, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011545573278697237 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4991982896846606, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562417388300208 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..98ed5902d08926280e30dce803047d23c0230b84 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4559059326563335, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.01151738312396153 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4681988241582042, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011539022035111231 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3c318e857098fd2fe1bb3594e5aeb8f033499d07 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4585783003741315, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011522687288692525 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.4730090860502405, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011545573278697237 + } + ], + "config": { + "model": "hf-causal", + 
"model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ff0e4d1d6cd2ff6bb7e436e4a2bc1512dc67fcff --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4462854088722608, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011495517440721683 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.44735435595938, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011498161586686657 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..aed2199ca81fb9b30f15cbf62537ee950378be3b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Choose-Story-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc": 0.4575093532870123, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011520605695184077 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Choose Story Ending", + "acc_norm": 0.45537145911277394, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011516282203726656 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..64f92fac23ece545d5f4270a577dd8e0cd2b5715 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_0.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_1.json new file mode 100644 index 0000000000000000000000000000000000000000..805e2a0cd6583b34ed239cf92ed3f354f43b6e4f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_1.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..883396f89aca307a3e0872e0f6dbed01a7f0a22f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_2.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b1da18c21cbe0da21625199e9251dc268d4c1b71 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_3.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..87aa18d47598b541ed22a0865ce6bc9fe8800f07 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_4.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c532df67ada25897fb5fde42f781c9187393195a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Generate-Ending_5.json @@ -0,0 +1,15 @@ +{ + "results": [], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_0.json new file mode 100644 index 0000000000000000000000000000000000000000..101af4ce1659fa29c3214d2713595b0829c79c08 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.515766969535008, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011556682042196382 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.5259219668626403, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011546883081384905 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_1.json new file mode 100644 index 
0000000000000000000000000000000000000000..ae9aca968551b3a7265fd44d5d5dc127ac70c205 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.48583645109567075, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011557792331301667 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.49599144842330306, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011562060664045738 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a31d97e420803e21ce9c06095ef2972b60057fac --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.46285408872260825, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011530479981182623 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.46873329770176375, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011539803085637733 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e55682b9ccae6961d2863be7fd438848468b7966 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.4585783003741315, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011522687288692525 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.464457509353287, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011533182338113986 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3c8661086919789a77281b8ea484a7eb0dca3c03 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.45056119722073756, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011505771738769861 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.44681988241582044, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011496846233300528 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1053abba537d47744340f54aefb4384902b76013 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Novel-Correct-Ending_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc": 0.45163014430785675, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011508201145928352 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Novel Correct Ending", + "acc_norm": 0.45163014430785675, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01150820114592835 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_0.json new file mode 100644 index 
0000000000000000000000000000000000000000..067b464a1afa384d9f0352d7f944c1ddb11347e4 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.5034740780331374, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011562153149168298 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.5285943345804383, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011543509045585206 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0cd09ea3fc263dc691bfa7d269d0562d9836f762 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.46018172100481025, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011525709570367512 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4949225013361839, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011561836054238772 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_2.json new file mode 100644 index 0000000000000000000000000000000000000000..13c7af982ecf5d70900a66707c4988a39121af09 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4521646178514164, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011509395748220111 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.467129877071085, + "dataset_path": 
"story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011537420054210294 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a5169980eef2697f7c1f706d2fe841851ae4442e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4478888295029396, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011499463505491369 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4596472474612507, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.01152471548624064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_4.json new file mode 100644 index 0000000000000000000000000000000000000000..45fae60525bb5e8224a6b4d4eb48accd988c3d41 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4393372528059861, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011477017982308784 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.4398717263495457, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011478521926587444 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file 
diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8d73cf9f6d6909c6d5cead8a81e187a6ed2cb10c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_story_cloze_2016_Story-Continuation-and-Options_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc": 0.4398717263495457, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_stderr": 0.011478521926587435 + }, + { + "task_name": "story_cloze_2016", + "prompt_name": "Story Continuation and Options", + "acc_norm": 0.45056119722073756, + "dataset_path": "story_cloze", + "dataset_name": "2016", + "subset": null, + "acc_norm_stderr": 0.011505771738769863 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8e70f3d683f82a70294511c4d9a70aa352df5786 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030039730592197812 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.47653429602888087, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030063300411902652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1d726d4b75f59852bb91a6b6854d93b7885cbdf9 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317194 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.48375451263537905, + "dataset_path": 
"super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2f347063d9b31e1626bfb1f09bad775971c7be78 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030080573208738064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_3.json new file mode 100644 index 0000000000000000000000000000000000000000..63df48d7b8d12584df1cad29455d60abf6afc56c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.49458483754512633, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_4.json new file mode 100644 index 
0000000000000000000000000000000000000000..e1df3641c3a1c044065e4a85d24bb6ccc7b92938 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.4693140794223827, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03003973059219781 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030072723167317194 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c01b3ee261b76593ace4b4e8361531918b5efea5 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_GPT-3-style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "GPT-3 style", + "acc_norm": 0.48014440433212996, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030072723167317194 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_0.json new file mode 100644 index 0000000000000000000000000000000000000000..300b7aff7fc8452723834f7f0bd46f7c604f76b0 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.4693140794223827, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030039730592197812 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_1.json new file mode 100644 index 0000000000000000000000000000000000000000..947ee7af47b4182c5d7aa2bfaf77a7604e448410 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d9f57f086396b546667ef00202b51e7e0055650a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976626 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030096267148976626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d3f1565281ac1319cd590856c25669cf48c02a09 --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.4981949458483754, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_4.json new file mode 100644 index 0000000000000000000000000000000000000000..056966b0bb1aa8967adec3bf6845ca200cef9008 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5306859205776173, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03003973059219781 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a1c59748fb1f66bc0cc143f04e1877d7240019ce --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_MNLI-crowdsource_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "MNLI crowdsource", + "acc_norm": 0.5090252707581228, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_0.json new file mode 100644 index 0000000000000000000000000000000000000000..98e011a1ace9060fcc8614898668680644190a3d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.4296028880866426, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.02979666882912467 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a3fe68f1b38af17987bcaf3385d0a536e82d673c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b6ec7a0251e1f414f55d1a5c42cc6831ae8bbf6e --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_3.json new file mode 100644 index 0000000000000000000000000000000000000000..596441db9a97d446d9c74504f2def2d5134d8ad1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317177 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030072723167317177 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b843279df07761281cc8cd7f2af872e7f8c62edd --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030063300411902652 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7f1206eb0b3cdfd8d989bfe1e21a9db5b997b76b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_does-it-follow-that_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.03006330041190266 + }, + { + "task_name": "superglue_rte", + "prompt_name": "does it follow that", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_0.json new file mode 100644 index 0000000000000000000000000000000000000000..994ea3131505def86ca4112b298282d1148a3712 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.48375451263537905, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_1.json new file mode 100644 index 0000000000000000000000000000000000000000..46f3c9cb6604485900e8bc1665b6201305d19244 --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_2.json new file mode 100644 index 0000000000000000000000000000000000000000..33b73e5a10fb84525d38c619c2cc06030abd7778 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976633 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b68c876c1e33cfb5a38ff3b8ec00f7dc8e123718 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5018050541516246, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030096267148976626 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03006330041190266 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4467fdd51e071603cebec0a72702d6cce200292f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030086851767188564 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dd37d1c7e7d87391a594587f18883d62a128a8de --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_guaranteed-true_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc": 0.516245487364621, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030080573208738064 + }, + { + "task_name": "superglue_rte", + "prompt_name": "guaranteed true", + "acc_norm": 0.5126353790613718, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030086851767188564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bd3090488e76407720840346d4ba2f5006be7f05 --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.47653429602888087, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030063300411902652 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5270758122743683, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030052303463143706 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f5e1a16b810bdc133f47c586a8bbd8088621e61f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030091559826331334 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.49097472924187724, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030091559826331334 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_2.json new file mode 100644 index 0000000000000000000000000000000000000000..83c84ea7add0cbb164cca259d705fc376c2bced0 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030094698123239966 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5054151624548736, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030094698123239966 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f47c33b5afab5525d73b2bc3d9f8879cbbc4c142 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317184 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030072723167317184 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4196e4001158acf03c58c39f31da920023342099 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030025579819366426 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5342960288808665, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.030025579819366426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ae657f47b4692dcaab27b742332c5cb89d92becf --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_superglue_rte_should-assume_5.json @@ -0,0 
+1,34 @@ +{ + "results": [ + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc": 0.51985559566787, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_stderr": 0.030072723167317184 + }, + { + "task_name": "superglue_rte", + "prompt_name": "should assume", + "acc_norm": 0.5234657039711191, + "dataset_path": "super_glue", + "dataset_name": "rte", + "subset": null, + "acc_norm_stderr": 0.03006330041190266 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_0.json new file mode 100644 index 0000000000000000000000000000000000000000..080ced4997ae2eaa71f51fcde3a96fbcaddd0a3d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140492945362904 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.505130228887135, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405174596179051 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7257e5af9d228e80167d513afe4de1f82d37d21b --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5240726124704025, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014036189665395134 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5122336227308603, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014048278820405621 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": 
false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_2.json new file mode 100644 index 0000000000000000000000000000000000000000..969cedbcfb7806edf152ad31f4387f66ddaa83e6 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052481306049516 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b7367ba57e410287b0fe684873c5d1c68d744a76 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.0140492945362904 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5138121546961326, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014047122916440422 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2875b5e460615fb22d3ebabe06c849c2732cd639 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5217048145224941, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014039239216484626 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5169692186266772, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + 
"acc_norm_stderr": 0.014044390401612967 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1f44556d4b310260b2d35bcb0b67406bbaf3f3c3 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_Replace_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc": 0.5382794001578532, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014011242594964123 + }, + { + "task_name": "winogrande", + "prompt_name": "Replace", + "acc_norm": 0.5311760063141279, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014025142640639513 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_0.json new file mode 100644 index 0000000000000000000000000000000000000000..59a7c549ec48e86133e2018b118e4c41bce7a241 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4956590370955012, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076896 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405213114691586 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cd9c6df321a25c065bba813fd6b4d1a3329c9b6c --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052446290529012 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225629 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d09f4a6cc8d8dd81ccdb204bac8b202fb027411c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.4940805051302289, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405150083848581 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4980268350434096, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225636 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8797c6ca4b1c9754c8709a09a424a01a7180e1e6 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.49171270718232046, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824194 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4940805051302289, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051500838485807 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_4.json new file mode 100644 index 0000000000000000000000000000000000000000..31be0edb433b32929d354bf9d3f7fcb62da6436f --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052131146915857 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.4964483030781373, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052131146915864 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3b1432542f9e9bbb2c40d01b3afec3d44d77da95 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_True-or-False_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc": 0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049749833367589 + }, + { + "task_name": "winogrande", + "prompt_name": "True or False", + "acc_norm": 0.5146014206787688, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014046492383275835 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cb2fb08add4f144baeec720c903aee42e1d1e5f1 --- /dev/null +++ 
b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5130228887134964, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014047718393997663 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076896 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..06a4991d2e427860b2440ea8832f0978c3a9624a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5146014206787688, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01404649238327584 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.49013417521704816, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049749833367596 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..207f50e650ca7419b74a97a0a707d28f91b52d6c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4861878453038674, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014047122916440419 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.48303078137332284, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014044390401612978 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..02d4cb7fad02f70c701259db9da66dff7a73f1b1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01405237625922564 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.4940805051302289, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051500838485807 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b888a474142b4b38a0e67c6073e602b75738d2c1 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.4711917916337806, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.01402914161590962 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.46408839779005523, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014016193433958298 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..972410913ed49060c98c82edf9012c4403f5644d --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_does-underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc": 0.48539857932123126, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014046492383275839 + }, + { + "task_name": "winogrande", + "prompt_name": "does underscore refer to", + "acc_norm": 0.47434885556432516, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014033980956108557 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6a9377d3daed078f73dcaf8a9876f332295e1e79 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5074980268350434, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050905521228571 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5011838989739542, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052446290529015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_1.json new file mode 100644 index 0000000000000000000000000000000000000000..73a0d00b6a9ae80544a879868eed825242555d92 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5185477505919495, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014042813708888378 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.510655090765588, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014049294536290396 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_2.json new file mode 100644 index 0000000000000000000000000000000000000000..da4b3513491abe1984cf0dba92e1c212f6d2cde0 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5114443567482242, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014048804199859325 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.01405213114691586 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a7a1634f691a1e9f0bc0876ce7bb3696065b788c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.516179952644041, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014045126130978601 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5146014206787688, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014046492383275835 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0c95d31a02756acc8fdeb3590baca81964e8250e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": 
"winogrande", + "prompt_name": "stand for", + "acc": 0.5098658247829518, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014049749833367585 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050555322824189 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ba5e1150f35b3cbed394020290a6c8f846f31dd9 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_stand-for_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014050555322824189 + }, + { + "task_name": "winogrande", + "prompt_name": "stand for", + "acc_norm": 0.5067087608524072, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051220692330349 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_0.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aaab9d3b76d0be7faa0d9f8d355944c0bf8c499c --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_0.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.500394632991318, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052481306049516 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.4996053670086819, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052481306049512 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_1.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1906559cc5cb938ba1e06add60c1834a2caced1a --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_1.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5027624309392266, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616441 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5082872928176796, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014050555322824194 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_2.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_2.json new file mode 100644 index 0000000000000000000000000000000000000000..20e57c3e3db6ce2622952c78a79863c0a9a2e0c2 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_2.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.4972375690607735, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014052271211616433 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5019731649565904, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014052376259225627 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_3.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e7eef82e92a54322d7a67b0cd903b598cf4ba085 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_3.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5035516969218626, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 
0.014052131146915853 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.0140519560640769 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_4.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7455365fb23a8ac076fac5cd200edb224421985e --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_4.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051956064076903 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.494869771112865, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051745961790516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_5.json b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_5.json new file mode 100644 index 0000000000000000000000000000000000000000..24e4c1473fd430d77b0a49780bfd1b17f05e0152 --- /dev/null +++ b/4b284b42boscar/eval/slim.4b284b42boscar_winogrande_underscore-refer-to_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc": 0.5059194948697711, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_stderr": 0.014051500838485807 + }, + { + "task_name": "winogrande", + "prompt_name": "underscore refer to", + "acc_norm": 0.5043409629044988, + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "subset": null, + "acc_norm_stderr": 0.014051956064076903 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No 
newline at end of file diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c79fb6783892f130e9e4b13832b45ebcecc42f89 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccb2b09c11c0cdde3745808d33bc0ab1be7641681feeff5cbf9d5bc9642ec045 +size 4126256 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..143a95eab3754c3ff91e7126464f8acb45d3de4e --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:966e9e17704f7c1fc9554826496a7ff5ec1198039b659aa0dce53c1e9d3b84dc +size 4634034 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..28c1c20abbceab802b839b15a1437ba9d9bc3bc4 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cdc6d76f3d5a64e6d50dc77a2bee47135d1cf28dccce54ce8a0fd8922acf1b3 +size 5539966 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f71cf612cef29648c8ac15b9fe6afa6d7c5d44a9 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e4d89579b4ad9454e9bc89bd05136c079b6f9cf9c242c966f62b1558102a44 +size 6449146 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6fe6d0b63c27f3962a94185911de0bfe1f439235 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ce7a556f0619db806fe1066df53b03f48dce3325102a2db7b8bc617557b7b3f +size 7300675 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..60e0ce4365863c1ba9a97da912aef0727f7ef391 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55fe4907d7fb398b10789ec47ee4c7c7c6ea670258c4e8af3876f27cf62cbdf8 +size 8165397 diff --git 
a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c528bf9d9d825a5e4617f210a68e9c3768a294e9 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36a5b598a497f006f1437f0ec4b8c7ccd6cb1ffbf33e72a04e9c983fd4982a1e +size 7334204 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02f710360471b5ba1119e57b7f49341451d07b73 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:184744754a91fafbaf02543f7d32d6660179516d0a2b7f2db712a429e3974e93 +size 13065478 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c41116bd44b736ea70798fdc86afc5d6d3394df4 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9c913819d4cf01fbe63e8a6644e1ce246b1e9171f68e4dfd1a9a5638fe2a211 +size 18684310 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..97d33bd1e24e7f78a7cd6e0dafd31e92a2836cbb --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b114956855b37bd7c3612b1ce538904aee01698572f53ea0356605dafddef633 +size 24126919 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..613cba4d88d9775fca977b9e0b6157a110c181d5 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:188be788d7857377f944469692b37f6697aa682b20222597041ebe151c789959 +size 29394906 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..79a8f53bf6e42dfe5edfc239d6c07458cc718dc8 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16f61440b6dc7ba4fa37bcfb54cc775f28efe4e0dffc55af134f4278de76a712 +size 34785534 diff --git 
a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8c061848e976c94361d6b3d011ce58f3e7fd3a96 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab7ad2fd4b66065db20eb44ed01f1db245bf4880ccddc22caec33e1bc8198db8 +size 3986049 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..14d7004960ddf9c34b3119f80c5c29822839846c --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b292f0fa7fd8fbe70711ddf34fa6f54471aaa732fa658b8443956f3bd5995844 +size 4993570 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..024b1bb48d30460b98874067cc17425c68aea2fb --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:962f45a1168ec8c72f4526d19dc4d35d05af5f841a98f6152b6b6b60c79f3db4 +size 6086704 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2e36b2feb8449c736eff9b89be441d59ecfed48e --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38a3e2122075fe7468613a8f6811d19d220e8d1684da9b57dd6a35e9d556ae45 +size 7177098 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23f220dab1a91b28a1705dade6de372b8368ee07 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:053e7dc588df7186d73b323d0f535099ccc6321fbc7f8756124458a4ada61ed8 +size 8260563 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3902dac424fc124a6816d35d148cb2c7ed8c38e5 --- /dev/null +++ 
b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0110113f9700faf441b6f8f00d8b5f42bd3d7a125fd5177fa4faa18f245ba7a1 +size 9349053 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_0.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3627d4509277f23540ae091124ef6ccdc2cadaa1 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9225c64dddd02387b96a53927b16eafc826ba15fa1feebfafaaad95fc32935e +size 2792207 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_1.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e8d71c6d25f5581325fb65d05325cf50c11a4d02 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd877ec884eefc18982860544e957417de3f7dd9f698f1e7aaa04dcc32ea931 +size 4985035 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_2.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e64374938e8d617a4277adac9a3e3326eda55215 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b7a047d833d4e0e79662215434103afd53baec6f6936383f51487952c5a78ee +size 7191499 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_3.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b4e19ec518aa4d4573c30c901ea93a490070755a --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f27d9e4a6c4d9a782ad6eb79d2216389bdb2c7960809b88d870921d1877325e6 +size 9458314 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_4.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f2a746f2aecd58877986d2173631404ad9a69802 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6046c747fbe2cb2e2842010756f5f747dcafa1fd4a73b5be3d6f7dcd139d7a96 +size 11628262 diff --git a/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_5.jsonl b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eacd3b2e841848b23a6d55478dd0ee191e1d1e33 --- /dev/null +++ 
b/4b284b42boscar/evaluation/generation/examples.4b284b42boscar_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35859ed75e8bb8038dc73d9f22e1430032c8dd49d7bc982eb10e37626e748328 +size 13897472 diff --git a/4b284b42boscar/evaluation/generation/merged.csv b/4b284b42boscar/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..a74cf3f8684d745fb365f959c8fd2461d159aead --- /dev/null +++ b/4b284b42boscar/evaluation/generation/merged.csv @@ -0,0 +1,51 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.08042598706024275 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.08042598706024275 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.23118450232385498 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.23118450232385498 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.25360971138111243 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.25360971138111243 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2589859331129998 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2589859331129998 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2613413893640116 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2613413893640116 +e2e_nlg_cleaned,4,average,multiple,0.2171095046484443 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04328828501096498 +gem_xsum,0,median,rouge2_fmeasure,0.04328828501096498 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04534620252044498 +gem_xsum,1,median,rouge2_fmeasure,0.04534620252044498 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05852877798479666 +gem_xsum,2,median,rouge2_fmeasure,0.05852877798479666 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05849869572638163 +gem_xsum,3,median,rouge2_fmeasure,0.05849869572638163 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01636894923247962 +gem_xsum,4,median,rouge2_fmeasure,0.01636894923247962 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0003960521179313103 +gem_xsum,5,median,rouge2_fmeasure,0.0003960521179313103 +gem_xsum,5,average,multiple,0.03707116043216653 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04944058280627724 +web_nlg_en,0,median,rouge2_fmeasure,0.04944058280627724 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08671853497591266 +web_nlg_en,1,median,rouge2_fmeasure,0.08671853497591266 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.1125014602113282 +web_nlg_en,2,median,rouge2_fmeasure,0.1125014602113282 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.11478946258154298 +web_nlg_en,3,median,rouge2_fmeasure,0.11478946258154298 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.12790770467752932 +web_nlg_en,4,median,rouge2_fmeasure,0.12790770467752932 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.13474976693002946 +web_nlg_en,5,median,rouge2_fmeasure,0.13474976693002946 +web_nlg_en,5,average,multiple,0.10435125203043664 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.0352234336154376 +wiki_lingua_en,0,median,rouge2_fmeasure,0.0352234336154376 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.035746404962808155 +wiki_lingua_en,1,median,rouge2_fmeasure,0.035746404962808155 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06444156598905733 +wiki_lingua_en,2,median,rouge2_fmeasure,0.06444156598905733 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.055649339745118376 +wiki_lingua_en,3,median,rouge2_fmeasure,0.055649339745118376 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.018728966913288573 +wiki_lingua_en,4,median,rouge2_fmeasure,0.018728966913288573 
+wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.003212219718708495 +wiki_lingua_en,5,median,rouge2_fmeasure,0.003212219718708495 +wiki_lingua_en,5,average,multiple,0.03550032182406976 diff --git a/4b284b42boscar/evaluation/generation/merged.json b/4b284b42boscar/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..bfa5161db0372b843d48015770733d74765815ec --- /dev/null +++ b/4b284b42boscar/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3920095188313853, "bleu_stderr": 0.05821741416066471, "rouge1_fmeasure": 0.10974192875795953, "rouge1_fmeasure_stderr": 0.002108481379704356, "rouge1_precision": 0.07689265316885617, "rouge1_precision_stderr": 0.0021889644764877354, "rouge1_recall": 0.3353633364771332, "rouge1_recall_stderr": 0.005464034559346198, "rouge2_fmeasure": 0.04944058280627724, "rouge2_fmeasure_stderr": 0.0012733709222344247, "rouge2_precision": 0.03467177168437793, "rouge2_precision_stderr": 0.001263966142663848, "rouge2_recall": 0.1501022471408039, "rouge2_recall_stderr": 0.0034656326105627303, "rougeL_fmeasure": 0.10487292177099015, "rougeL_fmeasure_stderr": 0.0019145881199173131, "rougeL_precision": 0.07300762564486572, "rougeL_precision_stderr": 0.0019791385357395687, "rougeL_recall": 0.3239944481490108, "rougeL_recall_stderr": 0.005278665955249544, "rougeLsum_fmeasure": 0.10223587057095268, "rougeLsum_fmeasure_stderr": 0.001953957091865035, "rougeLsum_precision": 0.07173881674469755, "rougeLsum_precision_stderr": 0.0020327580166630914, "rougeLsum_recall": 0.3108299199198791, "rougeLsum_recall_stderr": 0.0050441398235876655}}, "1": {"PALM_prompt": {"bleu": 0.5923583934046589, "bleu_stderr": 0.04413735939354442, "rouge1_fmeasure": 0.1697750677390444, "rouge1_fmeasure_stderr": 0.0038772039426472724, "rouge1_precision": 0.16010050130964712, "rouge1_precision_stderr": 0.005125042778102285, "rouge1_recall": 0.3196355072168386, "rouge1_recall_stderr": 0.005119850964520928, "rouge2_fmeasure": 0.08671853497591266, "rouge2_fmeasure_stderr": 0.002631475278488475, "rouge2_precision": 0.08438981052215226, "rouge2_precision_stderr": 0.0035428710754014017, "rouge2_recall": 0.16375709757632376, "rouge2_recall_stderr": 0.0035801758170607334, "rougeL_fmeasure": 0.15421967175389004, "rougeL_fmeasure_stderr": 0.003335409750319338, "rougeL_precision": 0.14464852171051198, "rougeL_precision_stderr": 0.004587920039107864, "rougeL_recall": 0.2991072637349091, "rougeL_recall_stderr": 0.004697289445064134, "rougeLsum_fmeasure": 0.15698432412836977, "rougeLsum_fmeasure_stderr": 0.0034258507511626024, "rougeLsum_precision": 0.14773643541948675, "rougeLsum_precision_stderr": 0.004697318924441809, "rougeLsum_recall": 0.3022253960230997, "rougeLsum_recall_stderr": 0.004739701115545982}}, "2": {"PALM_prompt": {"bleu": 0.87481699127398, "bleu_stderr": 0.059582002804657114, "rouge1_fmeasure": 0.2091307571621392, "rouge1_fmeasure_stderr": 0.00438758838932181, "rouge1_precision": 0.199466981650156, "rouge1_precision_stderr": 0.005680133053279315, "rouge1_recall": 0.36987834122063556, "rouge1_recall_stderr": 0.004990035433475184, "rouge2_fmeasure": 0.1125014602113282, "rouge2_fmeasure_stderr": 0.0030818329485191713, "rouge2_precision": 0.1097332483868951, "rouge2_precision_stderr": 0.003880260643680688, "rouge2_recall": 0.19989822315140882, "rouge2_recall_stderr": 0.003891995132588205, "rougeL_fmeasure": 0.18777604196623215, "rougeL_fmeasure_stderr": 0.0037686741944170416, "rougeL_precision": 0.17708218948059182, 
"rougeL_precision_stderr": 0.004959206002368757, "rougeL_recall": 0.34473442974785634, "rougeL_recall_stderr": 0.004653107328809787, "rougeLsum_fmeasure": 0.19268792375031765, "rougeLsum_fmeasure_stderr": 0.00391282714541826, "rougeLsum_precision": 0.18275767079289593, "rougeLsum_precision_stderr": 0.005158519349266604, "rougeLsum_recall": 0.3495549840141228, "rougeLsum_recall_stderr": 0.00470047440035223}}, "3": {"PALM_prompt": {"bleu": 0.917052522385174, "bleu_stderr": 0.04515993268519611, "rouge1_fmeasure": 0.21372711407619938, "rouge1_fmeasure_stderr": 0.004343895044489938, "rouge1_precision": 0.20888904165496103, "rouge1_precision_stderr": 0.005880585457367179, "rouge1_recall": 0.37814776640994463, "rouge1_recall_stderr": 0.004965150831364198, "rouge2_fmeasure": 0.11478946258154298, "rouge2_fmeasure_stderr": 0.003021059673039041, "rouge2_precision": 0.1168574620986129, "rouge2_precision_stderr": 0.004100382644887659, "rouge2_recall": 0.2033520821796969, "rouge2_recall_stderr": 0.00383298408380243, "rougeL_fmeasure": 0.19232862909601875, "rougeL_fmeasure_stderr": 0.0036812800257055574, "rougeL_precision": 0.18605811290952207, "rougeL_precision_stderr": 0.005155344402670075, "rougeL_recall": 0.35240104768920266, "rougeL_recall_stderr": 0.004560857947768766, "rougeLsum_fmeasure": 0.1967270339578505, "rougeLsum_fmeasure_stderr": 0.0038181408662667426, "rougeLsum_precision": 0.19147784744930493, "rougeLsum_precision_stderr": 0.0053469173736333826, "rougeLsum_recall": 0.3571044253500232, "rougeLsum_recall_stderr": 0.004630285578564224}}, "4": {"PALM_prompt": {"bleu": 1.0770812969732624, "bleu_stderr": 0.057150178907157206, "rouge1_fmeasure": 0.23556795007723105, "rouge1_fmeasure_stderr": 0.004533277698287348, "rouge1_precision": 0.22981660265238718, "rouge1_precision_stderr": 0.0060091983198828705, "rouge1_recall": 0.39885875556534467, "rouge1_recall_stderr": 0.004954969825753986, "rouge2_fmeasure": 0.12790770467752932, "rouge2_fmeasure_stderr": 0.0031721283738045283, "rouge2_precision": 0.12750847088352804, "rouge2_precision_stderr": 0.004054232891970157, "rouge2_recall": 0.21938886368566654, "rouge2_recall_stderr": 0.003991587985278903, "rougeL_fmeasure": 0.20953786614403025, "rougeL_fmeasure_stderr": 0.0038198330880285674, "rougeL_precision": 0.2007562947847461, "rougeL_precision_stderr": 0.005097419240769892, "rougeL_recall": 0.3691187056685199, "rougeL_recall_stderr": 0.004545072578815252, "rougeLsum_fmeasure": 0.21611300792836619, "rougeLsum_fmeasure_stderr": 0.003996019817069793, "rougeLsum_precision": 0.2090073734779189, "rougeLsum_precision_stderr": 0.0053710149363667935, "rougeLsum_recall": 0.3759411168888221, "rougeLsum_recall_stderr": 0.004622209838017953}}, "5": {"PALM_prompt": {"bleu": 1.1504048600305352, "bleu_stderr": 0.0755822004199928, "rouge1_fmeasure": 0.24203388956598934, "rouge1_fmeasure_stderr": 0.004720213442449205, "rouge1_precision": 0.24380689117778276, "rouge1_precision_stderr": 0.006371373530201665, "rouge1_recall": 0.3952201821451148, "rouge1_recall_stderr": 0.005033224197794555, "rouge2_fmeasure": 0.13474976693002946, "rouge2_fmeasure_stderr": 0.003415392488984557, "rouge2_precision": 0.1418504594039488, "rouge2_precision_stderr": 0.004575582136553598, "rouge2_recall": 0.21936500213450158, "rouge2_recall_stderr": 0.004020020751334439, "rougeL_fmeasure": 0.21539137146469606, "rougeL_fmeasure_stderr": 0.004028386529865914, "rougeL_precision": 0.21441718014302874, "rougeL_precision_stderr": 0.005537317704553182, "rougeL_recall": 0.36454653195485753, 
"rougeL_recall_stderr": 0.004645366026599087, "rougeLsum_fmeasure": 0.222229551042884, "rougeLsum_fmeasure_stderr": 0.004199234047990457, "rougeLsum_precision": 0.22298936285767557, "rougeLsum_precision_stderr": 0.005807349708841443, "rougeLsum_recall": 0.37135465454110034, "rougeLsum_recall_stderr": 0.004699420865300012}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 3.0868626526463756, "bleu_stderr": 0.10334620651927755, "rouge1_fmeasure": 0.13547708241929612, "rouge1_fmeasure_stderr": 0.0026556069493799896, "rouge1_precision": 0.13065355742673593, "rouge1_precision_stderr": 0.003004776280579319, "rouge1_recall": 0.1786059794590456, "rouge1_recall_stderr": 0.003480497255310532, "rouge2_fmeasure": 0.0352234336154376, "rouge2_fmeasure_stderr": 0.0010798725770486046, "rouge2_precision": 0.03291780593276675, "rouge2_precision_stderr": 0.0010760829551416996, "rouge2_recall": 0.04663674589032859, "rouge2_recall_stderr": 0.001504992918818976, "rougeL_fmeasure": 0.10336654675385076, "rougeL_fmeasure_stderr": 0.001968971023431868, "rougeL_precision": 0.09987150235228559, "rougeL_precision_stderr": 0.0023685337113851174, "rougeL_recall": 0.13959473564876165, "rougeL_recall_stderr": 0.0027581190918920103, "rougeLsum_fmeasure": 0.12705867398208964, "rougeLsum_fmeasure_stderr": 0.002506787857940285, "rougeLsum_precision": 0.12288788441024788, "rougeLsum_precision_stderr": 0.0028720387887482485, "rougeLsum_recall": 0.16759622234914887, "rougeLsum_recall_stderr": 0.0032867178590762366}}, "1": {"tldr_en": {"bleu": 2.5514775583592173, "bleu_stderr": 0.11284261690010776, "rouge1_fmeasure": 0.16580650031794167, "rouge1_fmeasure_stderr": 0.0021193004105035614, "rouge1_precision": 0.19126686092152348, "rouge1_precision_stderr": 0.003082884423906421, "rouge1_recall": 0.2013507500089625, "rouge1_recall_stderr": 0.0029925213818961095, "rouge2_fmeasure": 0.035746404962808155, "rouge2_fmeasure_stderr": 0.0011192977008782524, "rouge2_precision": 0.044582320866862424, "rouge2_precision_stderr": 0.0018321118047088484, "rouge2_recall": 0.04442769282883079, "rouge2_recall_stderr": 0.0014782140733801531, "rougeL_fmeasure": 0.1274347165087169, "rougeL_fmeasure_stderr": 0.0015926173758816556, "rougeL_precision": 0.14988887413463797, "rougeL_precision_stderr": 0.002586002929489674, "rougeL_recall": 0.15563614848002308, "rougeL_recall_stderr": 0.0023345594461348924, "rougeLsum_fmeasure": 0.15656227686437413, "rougeLsum_fmeasure_stderr": 0.0019872168695675768, "rougeLsum_precision": 0.18119105294163743, "rougeLsum_precision_stderr": 0.002938341173461725, "rougeLsum_recall": 0.19011415446724061, "rougeLsum_recall_stderr": 0.002811944286183604}}, "2": {"tldr_en": {"bleu": 4.160843322149366, "bleu_stderr": 0.1298621943311205, "rouge1_fmeasure": 0.227527675745211, "rouge1_fmeasure_stderr": 0.002262727638760964, "rouge1_precision": 0.27518353034811777, "rouge1_precision_stderr": 0.003681886785126964, "rouge1_recall": 0.26994010795497553, "rouge1_recall_stderr": 0.003076759192772023, "rouge2_fmeasure": 0.06444156598905733, "rouge2_fmeasure_stderr": 0.001316135695695516, "rouge2_precision": 0.08493527530542791, "rouge2_precision_stderr": 0.002342907743897752, "rouge2_recall": 0.07562522541046568, "rouge2_recall_stderr": 0.0016670921194054875, "rougeL_fmeasure": 0.17356357206141904, "rougeL_fmeasure_stderr": 0.0017138094026774846, "rougeL_precision": 0.21378447499783487, "rougeL_precision_stderr": 0.003086057725507419, "rougeL_recall": 0.20706303996123673, "rougeL_recall_stderr": 0.002420883132061566, "rougeLsum_fmeasure": 
0.21385148741041204, "rougeLsum_fmeasure_stderr": 0.0021335293397074695, "rougeLsum_precision": 0.2594677476381253, "rougeLsum_precision_stderr": 0.0035263951440822477, "rougeLsum_recall": 0.25382240907884707, "rougeLsum_recall_stderr": 0.0029022538349274}}, "3": {"tldr_en": {"bleu": 3.550269268318738, "bleu_stderr": 0.123976667805652, "rouge1_fmeasure": 0.19542809253461213, "rouge1_fmeasure_stderr": 0.002567437085860653, "rouge1_precision": 0.25197379414278026, "rouge1_precision_stderr": 0.004061114481390069, "rouge1_recall": 0.2241545120711431, "rouge1_recall_stderr": 0.003349453782690701, "rouge2_fmeasure": 0.055649339745118376, "rouge2_fmeasure_stderr": 0.0013436864732712206, "rouge2_precision": 0.07748731951936201, "rouge2_precision_stderr": 0.0024143985835070676, "rouge2_recall": 0.06396850914384923, "rouge2_recall_stderr": 0.001702594494628315, "rougeL_fmeasure": 0.1493802268370947, "rougeL_fmeasure_stderr": 0.001958022255827534, "rougeL_precision": 0.19629979985917206, "rougeL_precision_stderr": 0.00335622576475892, "rougeL_recall": 0.17220070773118773, "rougeL_recall_stderr": 0.0026273647249788665, "rougeLsum_fmeasure": 0.18360399651239948, "rougeLsum_fmeasure_stderr": 0.002422426009983074, "rougeLsum_precision": 0.2374958020095092, "rougeLsum_precision_stderr": 0.0038790177970112016, "rougeLsum_recall": 0.2104711045184697, "rougeLsum_recall_stderr": 0.0031578834852280147}}, "4": {"tldr_en": {"bleu": 0.18727975729565796, "bleu_stderr": 0.024743677554065382, "rouge1_fmeasure": 0.06369530530813045, "rouge1_fmeasure_stderr": 0.002225322220412946, "rouge1_precision": 0.08736393590844754, "rouge1_precision_stderr": 0.003320236333651585, "rouge1_recall": 0.07170734676784853, "rouge1_recall_stderr": 0.0026863536115591326, "rouge2_fmeasure": 0.018728966913288573, "rouge2_fmeasure_stderr": 0.0009803445596888972, "rouge2_precision": 0.026539844024013316, "rouge2_precision_stderr": 0.0016227872678637359, "rouge2_recall": 0.021485803029836836, "rouge2_recall_stderr": 0.0012234458979275497, "rougeL_fmeasure": 0.0499467876025126, "rougeL_fmeasure_stderr": 0.0017576059173842731, "rougeL_precision": 0.07026197905523278, "rougeL_precision_stderr": 0.002775489585285312, "rougeL_recall": 0.056214726744091746, "rougeL_recall_stderr": 0.0021292176604441565, "rougeLsum_fmeasure": 0.05976166581566339, "rougeLsum_fmeasure_stderr": 0.0020932837351865146, "rougeLsum_precision": 0.08238350525240551, "rougeLsum_precision_stderr": 0.00315393172421982, "rougeLsum_recall": 0.06716878885751017, "rougeLsum_recall_stderr": 0.002524004286911468}}, "5": {"tldr_en": {"bleu": 5.577157241826622e-13, "bleu_stderr": 1.1569346422902409e-11, "rouge1_fmeasure": 0.01043565444271075, "rouge1_fmeasure_stderr": 0.0010239464330389257, "rouge1_precision": 0.016308055800076632, "rouge1_precision_stderr": 0.0016664962689093485, "rouge1_recall": 0.011189655817252247, "rouge1_recall_stderr": 0.0011802896184760587, "rouge2_fmeasure": 0.003212219718708495, "rouge2_fmeasure_stderr": 0.00044504346236305713, "rouge2_precision": 0.004996639391364696, "rouge2_precision_stderr": 0.0007412751630573523, "rouge2_recall": 0.0035850495924414864, "rouge2_recall_stderr": 0.0005653334006649237, "rougeL_fmeasure": 0.008463888867128002, "rougeL_fmeasure_stderr": 0.0008498965362098245, "rougeL_precision": 0.013336917042039756, "rougeL_precision_stderr": 0.0013953475029781011, "rougeL_recall": 0.009180912567410115, "rougeL_recall_stderr": 0.001001081279334972, "rougeLsum_fmeasure": 0.009948416660924693, "rougeLsum_fmeasure_stderr": 0.000978895492286139, 
"rougeLsum_precision": 0.015695607203614483, "rougeLsum_precision_stderr": 0.0016199770379973, "rougeLsum_recall": 0.010692369328543307, "rougeLsum_recall_stderr": 0.0011385128958340267}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.1932091730586953, "bleu_stderr": 0.12204583279523666, "rouge1_fmeasure": 0.24046689564492205, "rouge1_fmeasure_stderr": 0.0026584396733124543, "rouge1_precision": 0.41337918705325344, "rouge1_precision_stderr": 0.0052760059693995, "rouge1_recall": 0.2309238228257254, "rouge1_recall_stderr": 0.0032774148724919335, "rouge2_fmeasure": 0.08042598706024275, "rouge2_fmeasure_stderr": 0.001263671869367697, "rouge2_precision": 0.23463707329236097, "rouge2_precision_stderr": 0.0062472430372265, "rouge2_recall": 0.07811161999290202, "rouge2_recall_stderr": 0.001551040325003802, "rougeL_fmeasure": 0.18994590486023305, "rougeL_fmeasure_stderr": 0.0016519612628498022, "rougeL_precision": 0.3599428455550758, "rougeL_precision_stderr": 0.005381765248981534, "rougeL_recall": 0.179931821205033, "rougeL_recall_stderr": 0.002310618989620828, "rougeLsum_fmeasure": 0.2231487737804936, "rougeLsum_fmeasure_stderr": 0.0024418581547524815, "rougeLsum_precision": 0.39508222499421797, "rougeLsum_precision_stderr": 0.005341940324256954, "rougeLsum_recall": 0.21269354787389044, "rougeLsum_recall_stderr": 0.003034851683547011}}, "1": {"generate_text_restaurant": {"bleu": 12.375900606698423, "bleu_stderr": 0.19606823444326352, "rouge1_fmeasure": 0.48293554827595836, "rouge1_fmeasure_stderr": 0.0023435879912405283, "rouge1_precision": 0.597454575764657, "rouge1_precision_stderr": 0.0032329061443020345, "rouge1_recall": 0.4434643604110928, "rouge1_recall_stderr": 0.0029889149083277117, "rouge2_fmeasure": 0.23118450232385498, "rouge2_fmeasure_stderr": 0.0020415023824907675, "rouge2_precision": 0.2910060712314691, "rouge2_precision_stderr": 0.0027476178598673066, "rouge2_recall": 0.21176179817940102, "rouge2_recall_stderr": 0.0021589328308703566, "rougeL_fmeasure": 0.3518994031108773, "rougeL_fmeasure_stderr": 0.0021201781748616372, "rougeL_precision": 0.43888436474875336, "rougeL_precision_stderr": 0.0030457086517011057, "rougeL_recall": 0.3220136107735307, "rougeL_recall_stderr": 0.0024545810076108667, "rougeLsum_fmeasure": 0.39555654566655024, "rougeLsum_fmeasure_stderr": 0.0023470162024293896, "rougeLsum_precision": 0.4908609045585646, "rougeLsum_precision_stderr": 0.003223896911814539, "rougeLsum_recall": 0.3626278022836946, "rougeLsum_recall_stderr": 0.0027433883081054507}}, "2": {"generate_text_restaurant": {"bleu": 14.369325725727359, "bleu_stderr": 0.16247647633686452, "rouge1_fmeasure": 0.5050896808334919, "rouge1_fmeasure_stderr": 0.0022820908410676275, "rouge1_precision": 0.6089651864273457, "rouge1_precision_stderr": 0.003198435500725766, "rouge1_recall": 0.4684512148933178, "rouge1_recall_stderr": 0.0029309075224427496, "rouge2_fmeasure": 0.25360971138111243, "rouge2_fmeasure_stderr": 0.0021297710933538033, "rouge2_precision": 0.31006692351264853, "rouge2_precision_stderr": 0.002823237008286986, "rouge2_recall": 0.23489561514574075, "rouge2_recall_stderr": 0.0022593518873055855, "rougeL_fmeasure": 0.3737642937216708, "rougeL_fmeasure_stderr": 0.002190131174828545, "rougeL_precision": 0.4528063450828851, "rougeL_precision_stderr": 0.0030647195241319818, "rougeL_recall": 0.34593233406974633, "rougeL_recall_stderr": 0.0025245059893910897, "rougeLsum_fmeasure": 0.4229068329404585, "rougeLsum_fmeasure_stderr": 0.0023792336336982954, "rougeLsum_precision": 
0.5104756682075146, "rougeLsum_precision_stderr": 0.0032393352108801977, "rougeLsum_recall": 0.39199556131381524, "rougeLsum_recall_stderr": 0.0027868585721285742}}, "3": {"generate_text_restaurant": {"bleu": 15.229236548887188, "bleu_stderr": 0.1893298480282073, "rouge1_fmeasure": 0.5094328850058195, "rouge1_fmeasure_stderr": 0.0023113991646470547, "rouge1_precision": 0.6019638812498068, "rouge1_precision_stderr": 0.003154041355931588, "rouge1_recall": 0.4769354854853059, "rouge1_recall_stderr": 0.002958348996182155, "rouge2_fmeasure": 0.2589859331129998, "rouge2_fmeasure_stderr": 0.0021744147637838404, "rouge2_precision": 0.30884118560646173, "rouge2_precision_stderr": 0.002746739827702377, "rouge2_recall": 0.24294392646302612, "rouge2_recall_stderr": 0.002367727486010716, "rougeL_fmeasure": 0.3764193148894467, "rougeL_fmeasure_stderr": 0.002186561194132359, "rougeL_precision": 0.4461308937388599, "rougeL_precision_stderr": 0.0029602114104730864, "rougeL_recall": 0.3521903496319005, "rougeL_recall_stderr": 0.002557452362321216, "rougeLsum_fmeasure": 0.428648276943535, "rougeLsum_fmeasure_stderr": 0.00241888010522652, "rougeLsum_precision": 0.5062370391801149, "rougeLsum_precision_stderr": 0.003164925596063054, "rougeLsum_recall": 0.4014335787816741, "rougeLsum_recall_stderr": 0.002844420919383205}}, "4": {"generate_text_restaurant": {"bleu": 15.5976388656912, "bleu_stderr": 0.21552554445196853, "rouge1_fmeasure": 0.5121277417214488, "rouge1_fmeasure_stderr": 0.0023435839886818115, "rouge1_precision": 0.5995295914989719, "rouge1_precision_stderr": 0.003160879006436628, "rouge1_recall": 0.48298984835020653, "rouge1_recall_stderr": 0.002984598478781118, "rouge2_fmeasure": 0.2613413893640116, "rouge2_fmeasure_stderr": 0.0022244910107010367, "rouge2_precision": 0.30839025650068297, "rouge2_precision_stderr": 0.002750823115926885, "rouge2_recall": 0.24686659194192842, "rouge2_recall_stderr": 0.0024081433509198598, "rougeL_fmeasure": 0.37763811148528253, "rougeL_fmeasure_stderr": 0.002256869338791484, "rougeL_precision": 0.4436724332909479, "rougeL_precision_stderr": 0.0029870463284766544, "rougeL_recall": 0.35568244613898303, "rougeL_recall_stderr": 0.0026122906695223214, "rougeLsum_fmeasure": 0.43177980537098043, "rougeLsum_fmeasure_stderr": 0.0024732350477868438, "rougeLsum_precision": 0.5052162393739371, "rougeLsum_precision_stderr": 0.0031736753796748323, "rougeLsum_recall": 0.4072510388482175, "rougeLsum_recall_stderr": 0.0029018213850476}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.7872267963517905, "bleu_stderr": 0.10045358377651255, "rouge1_fmeasure": 0.19062853936311658, "rouge1_fmeasure_stderr": 0.0029532646040511964, "rouge1_precision": 0.13999986827582567, "rouge1_precision_stderr": 0.0022474794503698063, "rouge1_recall": 0.3146848228074294, "rouge1_recall_stderr": 0.004975343003506539, "rouge2_fmeasure": 0.04328828501096498, "rouge2_fmeasure_stderr": 0.001566248398707883, "rouge2_precision": 0.03163976089485668, "rouge2_precision_stderr": 0.0011861306107240704, "rouge2_recall": 0.07306747424744006, "rouge2_recall_stderr": 0.002675800285732193, "rougeL_fmeasure": 0.139911595168302, "rougeL_fmeasure_stderr": 0.0021985427116768684, "rougeL_precision": 0.10267416140662887, "rougeL_precision_stderr": 0.0016806410262001903, "rougeL_recall": 0.23218112607700236, "rougeL_recall_stderr": 0.003798988027549183, "rougeLsum_fmeasure": 0.15161941662067227, "rougeLsum_fmeasure_stderr": 0.0024594718320593506, "rougeLsum_precision": 0.11113494233445374, "rougeLsum_precision_stderr": 
0.0018618965914790858, "rougeLsum_recall": 0.2517399727563149, "rougeLsum_recall_stderr": 0.004226741179316262}}, "1": {"article_DOC_summary": {"bleu": 2.0535765112268263, "bleu_stderr": 0.11972597685190434, "rouge1_fmeasure": 0.21134911198456563, "rouge1_fmeasure_stderr": 0.003029508784148872, "rouge1_precision": 0.198389498815838, "rouge1_precision_stderr": 0.0036200528310182637, "rouge1_recall": 0.27297271430803394, "rouge1_recall_stderr": 0.004198789938626886, "rouge2_fmeasure": 0.04534620252044498, "rouge2_fmeasure_stderr": 0.0017858055311589297, "rouge2_precision": 0.043045220973661015, "rouge2_precision_stderr": 0.0018919242792135235, "rouge2_recall": 0.0597065055598483, "rouge2_recall_stderr": 0.0023731544903471393, "rougeL_fmeasure": 0.16064653466424578, "rougeL_fmeasure_stderr": 0.0024169691348135945, "rougeL_precision": 0.15080384564385396, "rougeL_precision_stderr": 0.0028920115769217117, "rougeL_recall": 0.2086058984013953, "rougeL_recall_stderr": 0.0033970509765821025, "rougeLsum_fmeasure": 0.1633349245997364, "rougeLsum_fmeasure_stderr": 0.002468028232419068, "rougeLsum_precision": 0.15275988244015698, "rougeLsum_precision_stderr": 0.0028982643948039046, "rougeLsum_recall": 0.21320242569675277, "rougeLsum_recall_stderr": 0.0035810539802842534}}, "2": {"article_DOC_summary": {"bleu": 3.0737243948901085, "bleu_stderr": 0.20707696514444188, "rouge1_fmeasure": 0.2444857696168434, "rouge1_fmeasure_stderr": 0.0034588270505411917, "rouge1_precision": 0.2607761013228957, "rouge1_precision_stderr": 0.004351702830837392, "rouge1_recall": 0.2607885042839857, "rouge1_recall_stderr": 0.004018215060642568, "rouge2_fmeasure": 0.05852877798479666, "rouge2_fmeasure_stderr": 0.0023072565365240597, "rouge2_precision": 0.06344356630530605, "rouge2_precision_stderr": 0.0026820157377482148, "rouge2_recall": 0.06209709779478734, "rouge2_recall_stderr": 0.00246191601941908, "rougeL_fmeasure": 0.18609020665428283, "rougeL_fmeasure_stderr": 0.002889852915757798, "rougeL_precision": 0.19907633090246687, "rougeL_precision_stderr": 0.0036157515137942723, "rougeL_recall": 0.19814987180436297, "rougeL_recall_stderr": 0.003287204314003516, "rougeLsum_fmeasure": 0.18820566058762211, "rougeLsum_fmeasure_stderr": 0.002901279776307625, "rougeLsum_precision": 0.20074551849299913, "rougeLsum_precision_stderr": 0.0036053665449958016, "rougeLsum_recall": 0.20155142761693595, "rougeLsum_recall_stderr": 0.003414189551327218}}, "3": {"article_DOC_summary": {"bleu": 3.3049815114104333, "bleu_stderr": 0.159904818084904, "rouge1_fmeasure": 0.23929443585701224, "rouge1_fmeasure_stderr": 0.0037168757909286886, "rouge1_precision": 0.26517071973676093, "rouge1_precision_stderr": 0.004671054222681604, "rouge1_recall": 0.24180770628858178, "rouge1_recall_stderr": 0.003940301667285207, "rouge2_fmeasure": 0.05849869572638163, "rouge2_fmeasure_stderr": 0.0023240414297435643, "rouge2_precision": 0.06573984745309881, "rouge2_precision_stderr": 0.0027796960958526426, "rouge2_recall": 0.05833153123977987, "rouge2_recall_stderr": 0.0023156387603366626, "rougeL_fmeasure": 0.1821366529554913, "rougeL_fmeasure_stderr": 0.0030775956513009466, "rougeL_precision": 0.203598693888488, "rougeL_precision_stderr": 0.0039976969321419975, "rougeL_recall": 0.18373116025857217, "rougeL_recall_stderr": 0.003225733424944883, "rougeLsum_fmeasure": 0.1835081526249607, "rougeLsum_fmeasure_stderr": 0.003085655347849612, "rougeLsum_precision": 0.20485233666950228, "rougeLsum_precision_stderr": 0.003995574052859589, "rougeLsum_recall": 0.1854022097149016, 
"rougeLsum_recall_stderr": 0.003252182345953945}}, "4": {"article_DOC_summary": {"bleu": 0.14790698979130756, "bleu_stderr": 0.04384286085154373, "rouge1_fmeasure": 0.06274888299344217, "rouge1_fmeasure_stderr": 0.00368347747261438, "rouge1_precision": 0.07320382597863412, "rouge1_precision_stderr": 0.004382339503654509, "rouge1_recall": 0.06154298544226265, "rouge1_recall_stderr": 0.0037288317198754368, "rouge2_fmeasure": 0.01636894923247962, "rouge2_fmeasure_stderr": 0.0015408484981787023, "rouge2_precision": 0.018885269050898535, "rouge2_precision_stderr": 0.0018067685701354235, "rouge2_recall": 0.016055891153780352, "rouge2_recall_stderr": 0.0015325179921096511, "rougeL_fmeasure": 0.04906837355261336, "rougeL_fmeasure_stderr": 0.002968163725283441, "rougeL_precision": 0.057742538854948004, "rougeL_precision_stderr": 0.003573097680574354, "rougeL_recall": 0.0479152284127263, "rougeL_recall_stderr": 0.0029805271711328637, "rougeLsum_fmeasure": 0.04921620084471268, "rougeLsum_fmeasure_stderr": 0.0029754368968071495, "rougeLsum_precision": 0.05793234813776215, "rougeLsum_precision_stderr": 0.0035819297580720103, "rougeLsum_recall": 0.04803187451993844, "rougeLsum_recall_stderr": 0.0029874284500312834}}, "5": {"article_DOC_summary": {"bleu": 1.0193423208210636e-40, "bleu_stderr": 1.482688743257922e-35, "rouge1_fmeasure": 0.0021324501644318945, "rouge1_fmeasure_stderr": 0.0005815454829014453, "rouge1_precision": 0.002313901471635568, "rouge1_precision_stderr": 0.0006326582061365616, "rouge1_recall": 0.0020282790639805184, "rouge1_recall_stderr": 0.0005520696211086723, "rouge2_fmeasure": 0.0003960521179313103, "rouge2_fmeasure_stderr": 0.00017805195394425088, "rouge2_precision": 0.0004152380160032476, "rouge2_precision_stderr": 0.00018397799224345848, "rouge2_recall": 0.00038092195139919884, "rouge2_recall_stderr": 0.00017352719125781168, "rougeL_fmeasure": 0.0018203546217228452, "rougeL_fmeasure_stderr": 0.0005047529402091446, "rougeL_precision": 0.0019597074784293666, "rougeL_precision_stderr": 0.0005429844277966419, "rougeL_recall": 0.0017448307905115785, "rougeL_recall_stderr": 0.00048409378403815574, "rougeLsum_fmeasure": 0.0019229642762192925, "rougeLsum_fmeasure_stderr": 0.0005389418518153393, "rougeLsum_precision": 0.0020637585328133834, "rougeLsum_precision_stderr": 0.0005760004715550667, "rougeLsum_recall": 0.0018460791228919861, "rougeLsum_recall_stderr": 0.0005185190358660706}}}} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..faf1de1e1ad17ab20141d34787da161b95829699 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.3920095188313853, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05821741416066471 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07689265316885617, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0021889644764877354 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3353633364771332, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": 
null, + "rouge1_recall_stderr": 0.005464034559346198 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.10974192875795953, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002108481379704356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03467177168437793, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001263966142663848 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1501022471408039, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0034656326105627303 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04944058280627724, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012733709222344247 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07300762564486572, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0019791385357395687 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3239944481490108, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.005278665955249544 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10487292177099015, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019145881199173131 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07173881674469755, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020327580166630914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3108299199198791, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0050441398235876655 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10223587057095268, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001953957091865035 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a06ddccdd31bd5863cf096563a60b59544b4de1e --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5923583934046589, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04413735939354442 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.16010050130964712, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005125042778102285 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3196355072168386, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005119850964520928 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1697750677390444, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0038772039426472724 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.08438981052215226, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0035428710754014017 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16375709757632376, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0035801758170607334 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.08671853497591266, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002631475278488475 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.14464852171051198, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004587920039107864 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.2991072637349091, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004697289445064134 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.15421967175389004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003335409750319338 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.14773643541948675, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004697318924441809 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3022253960230997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004739701115545982 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.15698432412836977, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0034258507511626024 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8c15805902f88450646b0427a703e8ca82e6d5e8 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.87481699127398, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.059582002804657114 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.199466981650156, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005680133053279315 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.36987834122063556, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004990035433475184 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.2091307571621392, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00438758838932181 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.1097332483868951, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.003880260643680688 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.19989822315140882, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003891995132588205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.1125014602113282, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0030818329485191713 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.17708218948059182, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004959206002368757 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.34473442974785634, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004653107328809787 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.18777604196623215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0037686741944170416 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.18275767079289593, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005158519349266604 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3495549840141228, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00470047440035223 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.19268792375031765, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00391282714541826 + } + ], + 
"config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7b15ce77a0c72ca87603cdece4155e8b5beb25cc --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.917052522385174, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04515993268519611 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.20888904165496103, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005880585457367179 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.37814776640994463, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004965150831364198 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.21372711407619938, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004343895044489938 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.1168574620986129, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004100382644887659 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.2033520821796969, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00383298408380243 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.11478946258154298, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003021059673039041 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.18605811290952207, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005155344402670075 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.35240104768920266, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004560857947768766 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.19232862909601875, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0036812800257055574 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.19147784744930493, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 
0.0053469173736333826 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3571044253500232, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004630285578564224 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1967270339578505, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0038181408662667426 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2d7b6a9891726377525988a802cb6799d384cb8e --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 1.0770812969732624, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.057150178907157206 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.22981660265238718, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0060091983198828705 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.39885875556534467, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004954969825753986 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.23556795007723105, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004533277698287348 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.12750847088352804, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004054232891970157 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.21938886368566654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003991587985278903 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.12790770467752932, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0031721283738045283 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.2007562947847461, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005097419240769892 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3691187056685199, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + 
"subset": null, + "rougeL_recall_stderr": 0.004545072578815252 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.20953786614403025, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0038198330880285674 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.2090073734779189, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0053710149363667935 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3759411168888221, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004622209838017953 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.21611300792836619, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003996019817069793 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d01c7bed5f1b84ce1c8868a18272d98cf0f28785 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 1.1504048600305352, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0755822004199928 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.24380689117778276, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006371373530201665 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3952201821451148, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005033224197794555 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.24203388956598934, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004720213442449205 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.1418504594039488, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.004575582136553598 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.21936500213450158, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004020020751334439 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.13474976693002946, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003415392488984557 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.21441718014302874, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005537317704553182 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.36454653195485753, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004645366026599087 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.21539137146469606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004028386529865914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.22298936285767557, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.005807349708841443 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.37135465454110034, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004699420865300012 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.222229551042884, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004199234047990457 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e9b1ee90f0cd9d46125c774ad1fb84fe2e9a1669 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.13065355742673593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003004776280579319 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.1786059794590456, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003480497255310532 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.13547708241929612, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0026556069493799896 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.03291780593276675, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010760829551416996 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "tldr_en", + "rouge2_recall": 0.04663674589032859, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001504992918818976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0352234336154376, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010798725770486046 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.09987150235228559, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0023685337113851174 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.13959473564876165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0027581190918920103 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.10336654675385076, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001968971023431868 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.12288788441024788, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0028720387887482485 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.16759622234914887, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0032867178590762366 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.12705867398208964, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002506787857940285 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.0868626526463756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10334620651927755 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..08cf79c054c944ab6ff084e9c30269a6ebe3f298 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.19126686092152348, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003082884423906421 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2013507500089625, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge1_recall_stderr": 0.0029925213818961095 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.16580650031794167, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021193004105035614 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.044582320866862424, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018321118047088484 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.04442769282883079, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014782140733801531 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.035746404962808155, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011192977008782524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.14988887413463797, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002586002929489674 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.15563614848002308, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023345594461348924 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1274347165087169, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015926173758816556 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.18119105294163743, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002938341173461725 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.19011415446724061, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002811944286183604 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.15656227686437413, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019872168695675768 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.5514775583592173, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11284261690010776 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b7199fdbdecbaf9be52b97e5978f832ec9799730 
--- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.27518353034811777, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003681886785126964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.26994010795497553, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003076759192772023 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.227527675745211, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002262727638760964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.08493527530542791, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002342907743897752 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07562522541046568, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016670921194054875 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06444156598905733, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001316135695695516 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.21378447499783487, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003086057725507419 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.20706303996123673, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002420883132061566 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.17356357206141904, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017138094026774846 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2594677476381253, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0035263951440822477 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.25382240907884707, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0029022538349274 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.21385148741041204, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021335293397074695 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 4.160843322149366, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.1298621943311205 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c2abe43ec49c73bb3d6693c4b9e96bad1d155530 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.25197379414278026, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004061114481390069 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2241545120711431, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003349453782690701 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.19542809253461213, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002567437085860653 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.07748731951936201, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0024143985835070676 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06396850914384923, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001702594494628315 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.055649339745118376, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013436864732712206 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.19629979985917206, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00335622576475892 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.17220070773118773, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026273647249788665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1493802268370947, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001958022255827534 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2374958020095092, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0038790177970112016 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2104711045184697, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_recall_stderr": 0.0031578834852280147 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18360399651239948, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002422426009983074 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.550269268318738, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.123976667805652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..641477d20d4b910b484ff42ab0f9f2935f7bd89b --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.08736393590844754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003320236333651585 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.07170734676784853, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026863536115591326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06369530530813045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002225322220412946 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.026539844024013316, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0016227872678637359 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.021485803029836836, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012234458979275497 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.018728966913288573, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009803445596888972 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07026197905523278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002775489585285312 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.056214726744091746, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021292176604441565 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0499467876025126, 
+ "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017576059173842731 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.08238350525240551, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00315393172421982 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06716878885751017, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002524004286911468 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05976166581566339, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020932837351865146 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.18727975729565796, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.024743677554065382 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e969c4b63635f130c10e22a8121d7ee914b777aa --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.016308055800076632, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016664962689093485 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.011189655817252247, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0011802896184760587 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.01043565444271075, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010239464330389257 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.004996639391364696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007412751630573523 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0035850495924414864, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005653334006649237 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.003212219718708495, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00044504346236305713 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.013336917042039756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013953475029781011 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.009180912567410115, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001001081279334972 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.008463888867128002, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008498965362098245 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.015695607203614483, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016199770379973 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.010692369328543307, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011385128958340267 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.009948416660924693, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.000978895492286139 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 5.577157241826622e-13, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.1569346422902409e-11 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..033b06edbd3d76e2b5c32ec30c7a128afea6298f --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 3.1932091730586953, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12204583279523666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.41337918705325344, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0052760059693995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.2309238228257254, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0032774148724919335 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + 
"rouge1_fmeasure": 0.24046689564492205, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0026584396733124543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.23463707329236097, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0062472430372265 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.07811161999290202, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001551040325003802 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.08042598706024275, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001263671869367697 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3599428455550758, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.005381765248981534 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.179931821205033, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002310618989620828 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.18994590486023305, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0016519612628498022 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.39508222499421797, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.005341940324256954 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.21269354787389044, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003034851683547011 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2231487737804936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024418581547524815 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..41661b49f7324a69e36634ee50945205a7e076b1 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "bleu": 12.375900606698423, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.19606823444326352 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.597454575764657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032329061443020345 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4434643604110928, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029889149083277117 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.48293554827595836, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023435879912405283 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2910060712314691, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0027476178598673066 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.21176179817940102, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021589328308703566 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.23118450232385498, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020415023824907675 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.43888436474875336, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030457086517011057 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3220136107735307, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024545810076108667 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3518994031108773, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021201781748616372 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4908609045585646, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003223896911814539 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3626278022836946, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027433883081054507 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.39555654566655024, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023470162024293896 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8a102562da98dff96a2b2a73c2d97472a204b453 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.369325725727359, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16247647633686452 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.6089651864273457, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003198435500725766 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4684512148933178, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029309075224427496 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5050896808334919, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022820908410676275 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.31006692351264853, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002823237008286986 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23489561514574075, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022593518873055855 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.25360971138111243, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021297710933538033 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4528063450828851, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0030647195241319818 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34593233406974633, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025245059893910897 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3737642937216708, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002190131174828545 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.5104756682075146, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032393352108801977 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.39199556131381524, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027868585721285742 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4229068329404585, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023792336336982954 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d22b52b1565deb2404f714b9ebc6f0d166d84aa7 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.229236548887188, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1893298480282073 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.6019638812498068, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003154041355931588 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4769354854853059, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002958348996182155 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5094328850058195, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023113991646470547 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.30884118560646173, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002746739827702377 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24294392646302612, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002367727486010716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2589859331129998, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021744147637838404 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4461308937388599, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029602114104730864 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3521903496319005, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002557452362321216 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3764193148894467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002186561194132359 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.5062370391801149, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003164925596063054 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4014335787816741, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002844420919383205 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.428648276943535, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00241888010522652 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e64f6299ae2f22ace82f856b5b9ace8fb0d4a246 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 15.5976388656912, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.21552554445196853 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5995295914989719, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003160879006436628 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.48298984835020653, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002984598478781118 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.5121277417214488, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 
0.0023435839886818115 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.30839025650068297, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002750823115926885 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24686659194192842, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0024081433509198598 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2613413893640116, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0022244910107010367 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4436724332909479, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029870463284766544 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35568244613898303, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0026122906695223214 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.37763811148528253, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002256869338791484 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.5052162393739371, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031736753796748323 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4072510388482175, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0029018213850476 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.43177980537098043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0024732350477868438 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_0.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6481ef94ac7568d20e012f1be6d1cd4784931f7a --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.13999986827582567, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 
0.0022474794503698063 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3146848228074294, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004975343003506539 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19062853936311658, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029532646040511964 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03163976089485668, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011861306107240704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.07306747424744006, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002675800285732193 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04328828501096498, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001566248398707883 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10267416140662887, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016806410262001903 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.23218112607700236, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003798988027549183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.139911595168302, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021985427116768684 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11113494233445374, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018618965914790858 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2517399727563149, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004226741179316262 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15161941662067227, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024594718320593506 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7872267963517905, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10045358377651255 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_1.json 
b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d6f06abc33aa6ceb75bf2ef1df7cef7475885a8e --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.198389498815838, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0036200528310182637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.27297271430803394, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004198789938626886 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21134911198456563, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003029508784148872 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.043045220973661015, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0018919242792135235 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0597065055598483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023731544903471393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04534620252044498, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017858055311589297 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.15080384564385396, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0028920115769217117 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2086058984013953, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033970509765821025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16064653466424578, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0024169691348135945 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.15275988244015698, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0028982643948039046 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.21320242569675277, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035810539802842534 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1633349245997364, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002468028232419068 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.0535765112268263, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11972597685190434 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_2.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c2d8473683aab30c50f3de98e835090cab3b563e --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.2607761013228957, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004351702830837392 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2607885042839857, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004018215060642568 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2444857696168434, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0034588270505411917 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.06344356630530605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0026820157377482148 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06209709779478734, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00246191601941908 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05852877798479666, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0023072565365240597 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.19907633090246687, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0036157515137942723 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.19814987180436297, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003287204314003516 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.18609020665428283, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002889852915757798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.20074551849299913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0036053665449958016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.20155142761693595, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003414189551327218 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.18820566058762211, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002901279776307625 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.0737243948901085, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.20707696514444188 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_3.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b7f0128806ec27513f562bb5c568e99d368a7a5d --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.26517071973676093, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004671054222681604 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.24180770628858178, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003940301667285207 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.23929443585701224, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0037168757909286886 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.06573984745309881, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0027796960958526426 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.05833153123977987, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0023156387603366626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05849869572638163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0023240414297435643 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.203598693888488, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0039976969321419975 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.18373116025857217, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003225733424944883 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1821366529554913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0030775956513009466 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.20485233666950228, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003995574052859589 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.1854022097149016, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003252182345953945 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1835081526249607, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.003085655347849612 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.3049815114104333, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.159904818084904 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_4.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ea41e9a9a5900b8c99abf372a4015cc3186a1c25 --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.07320382597863412, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004382339503654509 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.06154298544226265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0037288317198754368 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.06274888299344217, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00368347747261438 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.018885269050898535, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0018067685701354235 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.016055891153780352, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015325179921096511 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.01636894923247962, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015408484981787023 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.057742538854948004, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 
0.003573097680574354 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0479152284127263, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0029805271711328637 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04906837355261336, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002968163725283441 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.05793234813776215, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0035819297580720103 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.04803187451993844, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0029874284500312834 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04921620084471268, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0029754368968071495 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.14790698979130756, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.04384286085154373 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_5.json b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..26689e0d955ed00b9a3ce86d918ac82147bdcd9d --- /dev/null +++ b/4b284b42boscar/evaluation/generation/slim.4b284b42boscar_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002313901471635568, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0006326582061365616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0020282790639805184, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005520696211086723 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0021324501644318945, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0005815454829014453 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0004152380160032476, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00018397799224345848 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00038092195139919884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": 
"", + "rouge2_recall_stderr": 0.00017352719125781168 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0003960521179313103, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00017805195394425088 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0019597074784293666, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005429844277966419 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0017448307905115785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00048409378403815574 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0018203546217228452, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005047529402091446 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0020637585328133834, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005760004715550667 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0018460791228919861, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005185190358660706 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0019229642762192925, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005389418518153393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.0193423208210636e-40, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.482688743257922e-35 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-oscar-repetitions/4b284b42boscar/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/rankeval/4b284b42boscar_0.json b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_0.json new file mode 100644 index 0000000000000000000000000000000000000000..425f82ff4ec7518ae22f852496522a51cecd6b15 --- /dev/null +++ b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.343, + "acc_stderr": 0.015019206922356951 + }, + "anli_r2": { + "acc": 0.344, + "acc_stderr": 0.015029633724408945 + }, + "anli_r3": { + "acc": 0.3491666666666667, + "acc_stderr": 0.013767075395077244 + }, + "cb": { + "acc": 0.32142857142857145, + "acc_stderr": 0.06297362289056341, + "f1": 0.1884169884169884 + }, + "copa": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506 + }, + "hellaswag": { + "acc": 0.40818562039434375, + "acc_stderr": 0.0049049335002558855, + "acc_norm": 0.5161322445727943, + "acc_norm_stderr": 0.004987183560792757 + }, + "rte": { + "acc": 0.5415162454873647, + "acc_stderr": 0.029992535385373317 + }, + "winogrande": { + "acc": 0.5540647198105761, + 
"acc_stderr": 0.01397009348233069 + }, + "storycloze_2016": { + "acc": 0.6841261357562801, + "acc_stderr": 0.01074989282701111 + }, + "boolq": { + "acc": 0.5568807339449541, + "acc_stderr": 0.008688282882073796 + }, + "arc_easy": { + "acc": 0.5736531986531986, + "acc_stderr": 0.010147858603835136, + "acc_norm": 0.5113636363636364, + "acc_norm_stderr": 0.010257133441117103 + }, + "arc_challenge": { + "acc": 0.2525597269624573, + "acc_stderr": 0.012696728980207708, + "acc_norm": 0.28242320819112626, + "acc_norm_stderr": 0.013155456884097224 + }, + "sciq": { + "acc": 0.855, + "acc_stderr": 0.011139977517890148, + "acc_norm": 0.772, + "acc_norm_stderr": 0.013273740700804474 + }, + "piqa": { + "acc": 0.7247007616974973, + "acc_stderr": 0.01042142927736953, + "acc_norm": 0.7323177366702938, + "acc_norm_stderr": 0.010330111189370418 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/rankeval/4b284b42boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..425f82ff4ec7518ae22f852496522a51cecd6b15 --- /dev/null +++ b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.343, + "acc_stderr": 0.015019206922356951 + }, + "anli_r2": { + "acc": 0.344, + "acc_stderr": 0.015029633724408945 + }, + "anli_r3": { + "acc": 0.3491666666666667, + "acc_stderr": 0.013767075395077244 + }, + "cb": { + "acc": 0.32142857142857145, + "acc_stderr": 0.06297362289056341, + "f1": 0.1884169884169884 + }, + "copa": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506 + }, + "hellaswag": { + "acc": 0.40818562039434375, + "acc_stderr": 0.0049049335002558855, + "acc_norm": 0.5161322445727943, + "acc_norm_stderr": 0.004987183560792757 + }, + "rte": { + "acc": 0.5415162454873647, + "acc_stderr": 0.029992535385373317 + }, + "winogrande": { + "acc": 0.5540647198105761, + "acc_stderr": 0.01397009348233069 + }, + "storycloze_2016": { + "acc": 0.6841261357562801, + "acc_stderr": 0.01074989282701111 + }, + "boolq": { + "acc": 0.5568807339449541, + "acc_stderr": 0.008688282882073796 + }, + "arc_easy": { + "acc": 0.5736531986531986, + "acc_stderr": 0.010147858603835136, + "acc_norm": 0.5113636363636364, + "acc_norm_stderr": 0.010257133441117103 + }, + "arc_challenge": { + "acc": 0.2525597269624573, + "acc_stderr": 0.012696728980207708, + "acc_norm": 0.28242320819112626, + "acc_norm_stderr": 0.013155456884097224 + }, + "sciq": { + "acc": 0.855, + "acc_stderr": 0.011139977517890148, + "acc_norm": 0.772, + "acc_norm_stderr": 0.013273740700804474 + }, + "piqa": { + "acc": 0.7247007616974973, + "acc_stderr": 0.01042142927736953, + "acc_norm": 0.7323177366702938, + "acc_norm_stderr": 0.010330111189370418 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/rankeval/4b284b42boscar_1.json 
b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_1.json new file mode 100644 index 0000000000000000000000000000000000000000..85db393dde577c4e36d14742600b67aaf4c69377 --- /dev/null +++ b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.35, + "acc_stderr": 0.015090650341444233 + }, + "anli_r2": { + "acc": 0.338, + "acc_stderr": 0.014965960710224489 + }, + "anli_r3": { + "acc": 0.345, + "acc_stderr": 0.013728421539454881 + }, + "cb": { + "acc": 0.35714285714285715, + "acc_stderr": 0.0646095738380922, + "f1": 0.263246425567704 + }, + "copa": { + "acc": 0.76, + "acc_stderr": 0.04292346959909283 + }, + "hellaswag": { + "acc": 0.40599482174865564, + "acc_stderr": 0.004900798868048131, + "acc_norm": 0.5247958573989245, + "acc_norm_stderr": 0.004983641854351151 + }, + "rte": { + "acc": 0.5776173285198556, + "acc_stderr": 0.029731622646495887 + }, + "winogrande": { + "acc": 0.5564325177584846, + "acc_stderr": 0.0139626949076204 + }, + "storycloze_2016": { + "acc": 0.6755745590593266, + "acc_stderr": 0.010826131344990888 + }, + "boolq": { + "acc": 0.5464831804281346, + "acc_stderr": 0.008707182331111644 + }, + "arc_easy": { + "acc": 0.6035353535353535, + "acc_stderr": 0.010037412763064524, + "acc_norm": 0.5757575757575758, + "acc_norm_stderr": 0.010141333654958562 + }, + "arc_challenge": { + "acc": 0.27047781569965873, + "acc_stderr": 0.012980954547659554, + "acc_norm": 0.30204778156996587, + "acc_norm_stderr": 0.013417519144716412 + }, + "sciq": { + "acc": 0.897, + "acc_stderr": 0.009616833339695796, + "acc_norm": 0.893, + "acc_norm_stderr": 0.009779910359847169 + }, + "piqa": { + "acc": 0.7290533188248096, + "acc_stderr": 0.010369718937426843, + "acc_norm": 0.7301414581066377, + "acc_norm_stderr": 0.010356595421852188 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/rankeval/4b284b42boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..85db393dde577c4e36d14742600b67aaf4c69377 --- /dev/null +++ b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.35, + "acc_stderr": 0.015090650341444233 + }, + "anli_r2": { + "acc": 0.338, + "acc_stderr": 0.014965960710224489 + }, + "anli_r3": { + "acc": 0.345, + "acc_stderr": 0.013728421539454881 + }, + "cb": { + "acc": 0.35714285714285715, + "acc_stderr": 0.0646095738380922, + "f1": 0.263246425567704 + }, + "copa": { + "acc": 0.76, + "acc_stderr": 0.04292346959909283 + }, + "hellaswag": { + "acc": 0.40599482174865564, + "acc_stderr": 0.004900798868048131, + "acc_norm": 0.5247958573989245, + "acc_norm_stderr": 0.004983641854351151 + }, + "rte": { + "acc": 0.5776173285198556, + "acc_stderr": 0.029731622646495887 + }, + "winogrande": { + "acc": 0.5564325177584846, + "acc_stderr": 0.0139626949076204 + }, + "storycloze_2016": { + "acc": 0.6755745590593266, + "acc_stderr": 0.010826131344990888 + }, + "boolq": { + "acc": 0.5464831804281346, + "acc_stderr": 0.008707182331111644 + }, + "arc_easy": { + "acc": 
0.6035353535353535, + "acc_stderr": 0.010037412763064524, + "acc_norm": 0.5757575757575758, + "acc_norm_stderr": 0.010141333654958562 + }, + "arc_challenge": { + "acc": 0.27047781569965873, + "acc_stderr": 0.012980954547659554, + "acc_norm": 0.30204778156996587, + "acc_norm_stderr": 0.013417519144716412 + }, + "sciq": { + "acc": 0.897, + "acc_stderr": 0.009616833339695796, + "acc_norm": 0.893, + "acc_norm_stderr": 0.009779910359847169 + }, + "piqa": { + "acc": 0.7290533188248096, + "acc_stderr": 0.010369718937426843, + "acc_norm": 0.7301414581066377, + "acc_norm_stderr": 0.010356595421852188 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/rankeval/4b284b42boscar_2.json b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9d8dabcbf07b4eca025f3d021320d02c60f63242 --- /dev/null +++ b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.325, + "acc_stderr": 0.014818724459095524 + }, + "anli_r2": { + "acc": 0.338, + "acc_stderr": 0.014965960710224479 + }, + "anli_r3": { + "acc": 0.3225, + "acc_stderr": 0.013499258621103247 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.2754385964912281 + }, + "copa": { + "acc": 0.74, + "acc_stderr": 0.04408440022768079 + }, + "hellaswag": { + "acc": 0.4095797649870544, + "acc_stderr": 0.004907512103128348, + "acc_norm": 0.5251941844254132, + "acc_norm_stderr": 0.004983442888677763 + }, + "rte": { + "acc": 0.5306859205776173, + "acc_stderr": 0.030039730592197816 + }, + "winogrande": { + "acc": 0.5548539857932123, + "acc_stderr": 0.013967662954355491 + }, + "storycloze_2016": { + "acc": 0.6761090326028861, + "acc_stderr": 0.010821488046867108 + }, + "boolq": { + "acc": 0.572782874617737, + "acc_stderr": 0.008651907722486108 + }, + "arc_easy": { + "acc": 0.601010101010101, + "acc_stderr": 0.01004824068379877, + "acc_norm": 0.5904882154882155, + "acc_norm_stderr": 0.010090368160990062 + }, + "arc_challenge": { + "acc": 0.2773037542662116, + "acc_stderr": 0.013082095839059374, + "acc_norm": 0.29948805460750855, + "acc_norm_stderr": 0.013385021637313572 + }, + "sciq": { + "acc": 0.908, + "acc_stderr": 0.009144376393151108, + "acc_norm": 0.905, + "acc_norm_stderr": 0.009276910103103313 + }, + "piqa": { + "acc": 0.7274211099020674, + "acc_stderr": 0.010389256803296021, + "acc_norm": 0.7241566920565833, + "acc_norm_stderr": 0.01042780550272912 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/rankeval/4b284b42boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..9d8dabcbf07b4eca025f3d021320d02c60f63242 --- /dev/null +++ b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json @@ -0,0 +1,87 @@ +{ + 
"results": { + "anli_r1": { + "acc": 0.325, + "acc_stderr": 0.014818724459095524 + }, + "anli_r2": { + "acc": 0.338, + "acc_stderr": 0.014965960710224479 + }, + "anli_r3": { + "acc": 0.3225, + "acc_stderr": 0.013499258621103247 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.2754385964912281 + }, + "copa": { + "acc": 0.74, + "acc_stderr": 0.04408440022768079 + }, + "hellaswag": { + "acc": 0.4095797649870544, + "acc_stderr": 0.004907512103128348, + "acc_norm": 0.5251941844254132, + "acc_norm_stderr": 0.004983442888677763 + }, + "rte": { + "acc": 0.5306859205776173, + "acc_stderr": 0.030039730592197816 + }, + "winogrande": { + "acc": 0.5548539857932123, + "acc_stderr": 0.013967662954355491 + }, + "storycloze_2016": { + "acc": 0.6761090326028861, + "acc_stderr": 0.010821488046867108 + }, + "boolq": { + "acc": 0.572782874617737, + "acc_stderr": 0.008651907722486108 + }, + "arc_easy": { + "acc": 0.601010101010101, + "acc_stderr": 0.01004824068379877, + "acc_norm": 0.5904882154882155, + "acc_norm_stderr": 0.010090368160990062 + }, + "arc_challenge": { + "acc": 0.2773037542662116, + "acc_stderr": 0.013082095839059374, + "acc_norm": 0.29948805460750855, + "acc_norm_stderr": 0.013385021637313572 + }, + "sciq": { + "acc": 0.908, + "acc_stderr": 0.009144376393151108, + "acc_norm": 0.905, + "acc_norm_stderr": 0.009276910103103313 + }, + "piqa": { + "acc": 0.7274211099020674, + "acc_stderr": 0.010389256803296021, + "acc_norm": 0.7241566920565833, + "acc_norm_stderr": 0.01042780550272912 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/rankeval/4b284b42boscar_3.json b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2443f7775c70ce2d20632ea5feea3aa16c0fef51 --- /dev/null +++ b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.329, + "acc_stderr": 0.01486539538592837 + }, + "anli_r2": { + "acc": 0.351, + "acc_stderr": 0.015100563798316402 + }, + "anli_r3": { + "acc": 0.35, + "acc_stderr": 0.013774667009018558 + }, + "cb": { + "acc": 0.5357142857142857, + "acc_stderr": 0.06724777654937658, + "f1": 0.48858858858858856 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.40748854809798846, + "acc_stderr": 0.004903628887264533, + "acc_norm": 0.5296753634734117, + "acc_norm_stderr": 0.004980985384152898 + }, + "rte": { + "acc": 0.4981949458483754, + "acc_stderr": 0.030096267148976626 + }, + "winogrande": { + "acc": 0.5485398579321231, + "acc_stderr": 0.01398611030101776 + }, + "storycloze_2016": { + "acc": 0.6798503474078034, + "acc_stderr": 0.010788532546733105 + }, + "boolq": { + "acc": 0.5737003058103975, + "acc_stderr": 0.008649531625805677 + }, + "arc_easy": { + "acc": 0.6014309764309764, + "acc_stderr": 0.01004645540047794, + "acc_norm": 0.5883838383838383, + "acc_norm_stderr": 0.010098218646714906 + }, + "arc_challenge": { + "acc": 0.26109215017064846, + "acc_stderr": 0.01283552390947384, + "acc_norm": 0.2935153583617747, + "acc_norm_stderr": 0.01330725044494112 + }, + "sciq": { + "acc": 0.914, + "acc_stderr": 0.008870325962594766, + "acc_norm": 0.919, + "acc_norm_stderr": 0.008632121032139986 + }, + "piqa": { + 
"acc": 0.7334058759521219, + "acc_stderr": 0.010316749863541367, + "acc_norm": 0.7431991294885746, + "acc_norm_stderr": 0.010192864802278039 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/rankeval/4b284b42boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..2443f7775c70ce2d20632ea5feea3aa16c0fef51 --- /dev/null +++ b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.329, + "acc_stderr": 0.01486539538592837 + }, + "anli_r2": { + "acc": 0.351, + "acc_stderr": 0.015100563798316402 + }, + "anli_r3": { + "acc": 0.35, + "acc_stderr": 0.013774667009018558 + }, + "cb": { + "acc": 0.5357142857142857, + "acc_stderr": 0.06724777654937658, + "f1": 0.48858858858858856 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.40748854809798846, + "acc_stderr": 0.004903628887264533, + "acc_norm": 0.5296753634734117, + "acc_norm_stderr": 0.004980985384152898 + }, + "rte": { + "acc": 0.4981949458483754, + "acc_stderr": 0.030096267148976626 + }, + "winogrande": { + "acc": 0.5485398579321231, + "acc_stderr": 0.01398611030101776 + }, + "storycloze_2016": { + "acc": 0.6798503474078034, + "acc_stderr": 0.010788532546733105 + }, + "boolq": { + "acc": 0.5737003058103975, + "acc_stderr": 0.008649531625805677 + }, + "arc_easy": { + "acc": 0.6014309764309764, + "acc_stderr": 0.01004645540047794, + "acc_norm": 0.5883838383838383, + "acc_norm_stderr": 0.010098218646714906 + }, + "arc_challenge": { + "acc": 0.26109215017064846, + "acc_stderr": 0.01283552390947384, + "acc_norm": 0.2935153583617747, + "acc_norm_stderr": 0.01330725044494112 + }, + "sciq": { + "acc": 0.914, + "acc_stderr": 0.008870325962594766, + "acc_norm": 0.919, + "acc_norm_stderr": 0.008632121032139986 + }, + "piqa": { + "acc": 0.7334058759521219, + "acc_stderr": 0.010316749863541367, + "acc_norm": 0.7431991294885746, + "acc_norm_stderr": 0.010192864802278039 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/rankeval/4b284b42boscar_4.json b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2af472265a91ab189f18c1d4a77a9c15f791f0a5 --- /dev/null +++ b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.325, + "acc_stderr": 0.014818724459095526 + }, + "anli_r2": { + "acc": 0.348, + "acc_stderr": 0.01507060460376841 + }, + "anli_r3": { + "acc": 0.3525, + "acc_stderr": 0.013797164918918359 + }, + "cb": { + "acc": 0.5178571428571429, + "acc_stderr": 0.06737697508644647, + "f1": 0.406816186447442 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.04560480215720684 + }, + "hellaswag": { + "acc": 0.4099780920135431, 
+ "acc_stderr": 0.0049082413543102095, + "acc_norm": 0.5292770364469229, + "acc_norm_stderr": 0.00498122013588233 + }, + "rte": { + "acc": 0.48014440433212996, + "acc_stderr": 0.0300727231673172 + }, + "winogrande": { + "acc": 0.5548539857932123, + "acc_stderr": 0.013967662954355493 + }, + "storycloze_2016": { + "acc": 0.6830571886691609, + "acc_stderr": 0.01075965095145212 + }, + "boolq": { + "acc": 0.5636085626911315, + "acc_stderr": 0.008674000467432073 + }, + "arc_easy": { + "acc": 0.601010101010101, + "acc_stderr": 0.01004824068379876, + "acc_norm": 0.5841750841750841, + "acc_norm_stderr": 0.01011334824464787 + }, + "arc_challenge": { + "acc": 0.2790102389078498, + "acc_stderr": 0.013106784883601333, + "acc_norm": 0.29692832764505117, + "acc_norm_stderr": 0.013352025976725222 + }, + "sciq": { + "acc": 0.925, + "acc_stderr": 0.008333333333333352, + "acc_norm": 0.917, + "acc_norm_stderr": 0.00872852720607479 + }, + "piqa": { + "acc": 0.7285092491838956, + "acc_stderr": 0.010376251176596137, + "acc_norm": 0.733949945593036, + "acc_norm_stderr": 0.01031003926335282 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/rankeval/4b284b42boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..2af472265a91ab189f18c1d4a77a9c15f791f0a5 --- /dev/null +++ b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.325, + "acc_stderr": 0.014818724459095526 + }, + "anli_r2": { + "acc": 0.348, + "acc_stderr": 0.01507060460376841 + }, + "anli_r3": { + "acc": 0.3525, + "acc_stderr": 0.013797164918918359 + }, + "cb": { + "acc": 0.5178571428571429, + "acc_stderr": 0.06737697508644647, + "f1": 0.406816186447442 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.04560480215720684 + }, + "hellaswag": { + "acc": 0.4099780920135431, + "acc_stderr": 0.0049082413543102095, + "acc_norm": 0.5292770364469229, + "acc_norm_stderr": 0.00498122013588233 + }, + "rte": { + "acc": 0.48014440433212996, + "acc_stderr": 0.0300727231673172 + }, + "winogrande": { + "acc": 0.5548539857932123, + "acc_stderr": 0.013967662954355493 + }, + "storycloze_2016": { + "acc": 0.6830571886691609, + "acc_stderr": 0.01075965095145212 + }, + "boolq": { + "acc": 0.5636085626911315, + "acc_stderr": 0.008674000467432073 + }, + "arc_easy": { + "acc": 0.601010101010101, + "acc_stderr": 0.01004824068379876, + "acc_norm": 0.5841750841750841, + "acc_norm_stderr": 0.01011334824464787 + }, + "arc_challenge": { + "acc": 0.2790102389078498, + "acc_stderr": 0.013106784883601333, + "acc_norm": 0.29692832764505117, + "acc_norm_stderr": 0.013352025976725222 + }, + "sciq": { + "acc": 0.925, + "acc_stderr": 0.008333333333333352, + "acc_norm": 0.917, + "acc_norm_stderr": 0.00872852720607479 + }, + "piqa": { + "acc": 0.7285092491838956, + "acc_stderr": 0.010376251176596137, + "acc_norm": 0.733949945593036, + "acc_norm_stderr": 0.01031003926335282 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + 
"winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/rankeval/4b284b42boscar_5.json b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e1ac0dac05a0061c134203244a47b423e4df86cb --- /dev/null +++ b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_5.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811476 + }, + "anli_r2": { + "acc": 0.353, + "acc_stderr": 0.01512017260548369 + }, + "anli_r3": { + "acc": 0.3375, + "acc_stderr": 0.013655897185463652 + }, + "cb": { + "acc": 0.5178571428571429, + "acc_stderr": 0.06737697508644647, + "f1": 0.40404606286959227 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.4084843656642103, + "acc_stderr": 0.0049054894940050746, + "acc_norm": 0.5316669986058554, + "acc_norm_stderr": 0.004979763862134992 + }, + "rte": { + "acc": 0.48736462093862815, + "acc_stderr": 0.030086851767188564 + }, + "winogrande": { + "acc": 0.5438042620363063, + "acc_stderr": 0.013998453610924324 + }, + "storycloze_2016": { + "acc": 0.6857295563869589, + "acc_stderr": 0.010735132285108171 + }, + "boolq": { + "acc": 0.5703363914373089, + "acc_stderr": 0.00865809540849789 + }, + "arc_easy": { + "acc": 0.6064814814814815, + "acc_stderr": 0.010024426884292555, + "acc_norm": 0.601010101010101, + "acc_norm_stderr": 0.010048240683798747 + }, + "arc_challenge": { + "acc": 0.28071672354948807, + "acc_stderr": 0.013131238126975572, + "acc_norm": 0.29692832764505117, + "acc_norm_stderr": 0.013352025976725223 + }, + "sciq": { + "acc": 0.919, + "acc_stderr": 0.008632121032139985, + "acc_norm": 0.915, + "acc_norm_stderr": 0.008823426366942324 + }, + "piqa": { + "acc": 0.7230685527747551, + "acc_stderr": 0.010440499969334535, + "acc_norm": 0.7257889009793254, + "acc_norm_stderr": 0.010408618664933384 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42boscar/evaluation/rankeval/4b284b42boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..e1ac0dac05a0061c134203244a47b423e4df86cb --- /dev/null +++ b/4b284b42boscar/evaluation/rankeval/4b284b42boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811476 + }, + "anli_r2": { + "acc": 0.353, + "acc_stderr": 0.01512017260548369 + }, + "anli_r3": { + "acc": 0.3375, + "acc_stderr": 0.013655897185463652 + }, + "cb": { + "acc": 0.5178571428571429, + "acc_stderr": 0.06737697508644647, + "f1": 0.40404606286959227 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.4084843656642103, + "acc_stderr": 0.0049054894940050746, + "acc_norm": 0.5316669986058554, + "acc_norm_stderr": 0.004979763862134992 + }, + "rte": { + "acc": 0.48736462093862815, + "acc_stderr": 0.030086851767188564 + }, + "winogrande": { + "acc": 0.5438042620363063, + 
"acc_stderr": 0.013998453610924324 + }, + "storycloze_2016": { + "acc": 0.6857295563869589, + "acc_stderr": 0.010735132285108171 + }, + "boolq": { + "acc": 0.5703363914373089, + "acc_stderr": 0.00865809540849789 + }, + "arc_easy": { + "acc": 0.6064814814814815, + "acc_stderr": 0.010024426884292555, + "acc_norm": 0.601010101010101, + "acc_norm_stderr": 0.010048240683798747 + }, + "arc_challenge": { + "acc": 0.28071672354948807, + "acc_stderr": 0.013131238126975572, + "acc_norm": 0.29692832764505117, + "acc_norm_stderr": 0.013352025976725223 + }, + "sciq": { + "acc": 0.919, + "acc_stderr": 0.008632121032139985, + "acc_norm": 0.915, + "acc_norm_stderr": 0.008823426366942324 + }, + "piqa": { + "acc": 0.7230685527747551, + "acc_stderr": 0.010440499969334535, + "acc_norm": 0.7257889009793254, + "acc_norm_stderr": 0.010408618664933384 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9eab2019028bf7ff6ddee2da6ca6d30f8ac16b35 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8469284d3f8314e5ccb84c478694a914a625f3b8e31a6b6ba04f79462972968d +size 199058647 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc5170f2783627b9ab219281781903004f7e322d --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c048abc19985528287c78555fd84b61edf667ff4732f173da493ae02b2c25a28 +size 199058647 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19380484ea0d9bbd80ed66f34a444a52a454a02a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0df339267a584df04ca7bf0eee078d106a4d80a99e33ce5edcb2b178715722a +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5bddd079637865ad62cf210a545bb85a56aed3e --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eabc4e3eea09a2d1cc34520bc81fbdf76944ee5b2fda5a63d326f7845371bbdf +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..4544876215a182b61c2dd4c4dea16a77981ed69c --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0407bdd96e8a53679ff631733a5083933cb4844c68d1f158b4deb574eedba8f +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8860fc8ab6047c7aab2ddc1688def54cb0726ced --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:275f8f1c8794f3ecf7c5920ce54a12f0241e3189a457744bce3205bc32deff88 +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..825ec76a56e08aca8a56760f3793f76ff1dfca99 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac100caebce8dbe2e2ff93f2e10e1ff64b94a0b58a2910d57740177b516a66bb +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..212271290ad1231736038f97918d2a375f9dba50 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0a0204095eea08b80da600c2fa18cc7447efb6bce877ab296d1e5aeec7fad62 +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dca3923c5ad50a1058cb738573a1cebe96103c9 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10eb36fc24f8cd76ffd920ccb14ffd24dff49b934eebe8da0df33fd1f559038a +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fc3be882acad38677d1c3f5a80c33424132842a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1b834041bf4a32a77dea33f1f09be275cee9f348de745fafca1601a424cf62b +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..defce28137318247d04b91340a3e4ffce0605d3e --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7221b1ce2fa0488d68ebda76e5255b701f1d3f33fa176ebecc589f941f6c7f44 +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3fbb2dffc538ecf44a124d39d541160e128ca231 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8eb6f43d7224edd83c04ef8712c9da34269d4d30686496a40c602b249cf93bc +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f802faf6af982cfc65b8a31b09eacf8f25451162 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9946369432fd6b5ab65b50c99ca4936ae72974e9ad07842761b1ce65b4de4ef7 +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18c8a3c3f2b9a409297d73d316dd6212799adf69 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa22cb4d167f0b3f54d5d54ee75975b7ef429628629c1815bdfc99ac861e9adf +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83de9e7f1e3bb10666027924c6b268d03d3588d7 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:724686105de9e564b1a6d3e82b9d2237340edc2b6c74acab13a2e17a251945fa +size 199058669 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62c862ac2fb0c1cf8c7ef6f113e81f6338b2e127 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afdff205c3b0a420619c4a1f7b990862c9c2cea0982e2839007120fb6d2aed45 +size 199058669 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2179a279353932d7f38a0d1f0ec007d195126fe0 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8386150542491deb92d7e7278cb56ffb974ae4428b2fd5ba6896a2205780e62b +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..387d8037af2e1c6b3f71bd2f6dcb3dea4fef1a16 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc5edad047429d0e0c2f875440a255e3585b7cadf5e0eb835ddaf9c20648efc9 +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab9a94555c14b3e1d8b1f7e723b22ae27392718a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55f788717fa6394f01e5945c05e432ffe46e525219cd82c899aa39924a778d51 +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c1e82d6f173d8d71d3a9068020083f6a7c6ab24 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f796ded3ad12cefef90a7c0fb93f471fc94a1178bd25fd8a04dc20412c943bba +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c2222a4ddff72b0c7a9deb34cadab0ae21d1f14 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfcc98cec92c6e76eb4f983904c4f8f8e7e29ec60309d024900f6b991db12387 +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9be586f7e8cab908f05702f42596eb5323af0b77 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f967fd689326971627cb1175752e7be90f14b5f49596dffa74bf1cb86fefde5 +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3c957be609ab95945ab4f960975bc4c6206824b --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2858b1c76bb77e2e83063f18e6464fd82600cc83bff6320343ef655e41ebb87 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e5b80fca7087df3a72530e3eb0597bae06b49a0 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:acf2256940cdbce653dd888265ff09798c362efa41c7a4704c6204a62f13aa9a +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef6fae55988321921b881f5f62a3d5882e701146 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b250310402527bf9ed7eae5b7a2a41d484c627c12cb45a851cbd58fe57443cc +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a91c36f956d000d637b8a65cf27cadd19f283197 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0edd22942d8b15112d851f381aa4ed286cce7df86675a5f1686a94d3dcb9ee3e +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eee81911a7b29a705e37c4a70042cabeb121ddbf --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a8b948f0d5823f7fc930e87ccc43df2172e67f2746b01145cca8e98ecfa12de +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25b95b9940e408af3bd8f275ae03723e95052bb1 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3de61cfb5ec9af9f427092643a79bd33433f67096ff56fdf6920380c59ee6c4c +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46e24f638bffecd118c7a24a3e261dbda950fe04 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c15993a6795f949604297a81c441d96aa35fc35f16faded32867d1be005ef9c0 +size 199058605 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd1bd031b3671f1e049144fd533af6336c4b392a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46098762386a9822cb5c781f61eb85a78e9e74aa69ec49cc6d76467e3627b743 +size 199058605 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..2107eed252d33015f298f5f2c3012f08ce5dccb1 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c68d652f1f117fe4ad071eb35f080594959f7d4953838b512e5f46393afbe07a +size 199058669 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bfe2370a742f28559e9133ede862f22b06d6a12 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16447789485e6127d4f424658e7ac4f71dcc633366c37ffc79d0a13cd515e1bf +size 199058669 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cb78c430be0c71d60b0fc77f375ba57862f11cb --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e91430cf88d527d259d24fb9172235af811c0ae7349de0493fa7446a64fd7620 +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1d88164e935fcf9fd56b0e0a019b8af07c34d61 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:424fdc1184411782a1a1959b5541d63bb756bbe58a720749a2fbd583c3c0b0a8 +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2aec622bc43ab8a6d6a68a007b1007c5f6901b10 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73b1a78bc8c9c5b3849b7c27dae041f6ee2fc86d028d8fe5da51e89426ebb9a2 +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b950c5566fc59b84c6490f5d001b22718dad2d1 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96d86edd941cd056f776085d4f932a2158f8b4e830bc0e5f7cb372c454747772 +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c73a2007711ba927d62f06eb49e55c990331cb76 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1576fe173c2cc4d5a4d7af11bf6b75a05e203169f4bcca99e4e2a54b5d5008bb +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75321a8c12bc1722501b3b2c26bd3e3f7d46ebe4 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f29a6bf408d35a149aea61512cf13c44ef5cbba0ebab71b3dfc02ffcb7a7a13 +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba84044167b6fa1afddee250556ce654366a1b5c --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24d1b9d95158fcfec57034c160937f0b615baa78f127b01a8b270222b476660e +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0039b7d14dab5a0e323fbc9196cb53ec89f36f0f --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9a2da5044e251bacbd70b3752ee56fdc37397098b66aeebec090cfffc351767 +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e153e9bfc3f136ccbc54a2cd1e63fee4f6bbb06d --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e59e96ab58a1828afb6fe0d865c636037bad258b1f130eab63226322be1346f5 +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fd7b5891571c0120a4b6b948c8df9d7bb513575 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea23c17dce9079925177395de80d4682f6dddc8ddcbad0afdc82b87d10ce2246 +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1821feab51c2e873f6a4022c735b36cc3f7c18c3 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7c1ac8a756664cb35b3fc5c19aef51d9755948fba6c80baa2f3e2149ccd530b +size 199058669 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..9ea4262585bb1f2f61fb62f2d81d3347ce5ce190 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40f40798c14e0a9273cb2f93e06663820294b56493f2af118d680ec249066104 +size 199058669 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6dcf395b42430e7c23d596cf2fa4fb42c9d4e53 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e969493ebd7d06300a03234e40d8cda2344db49a11c63b63ad0d5fd88f5ebf28 +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e45797eb51ac02ede887b0a26a25a1172f02c093 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2664a25619d4085a342ea7fa3296b8a5e10f1e5b872a58455db562254ca9351c +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbac6d1c780327744a56382c7cf4c5eaedc62dc5 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d22c8db724327240670ddff70d0e1c4ac70fada7a47fa7f0728783afeb81e25 +size 199058669 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0deeaec0418eac738c47e2aa708637515d0a6f09 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfe2b840b1fe2ba9316ca7363bcac18334428d1c12b1e1ba6f98cf710ead7a67 +size 199058669 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82b8652da0813cd9e8d0308b0c683ba247812d0e --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f38d893d967ad0bd73795e244cd14402b0a8476f344008700723bea1cd66f94d +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecc84654d137e5d8c1e33b00fa09b3f0b590ecc6 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9b0ef37cdbdce41ac6659ccd9fd69287b58cba0dabab381d8c47047daf60d880 +size 199058797 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa0fada9f3a6bcaf36c28ee5407c0a582065559a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2017d8c86cf3f40359f4ca02d6d1a8369791db2289ff83bce1401acc2b1ba2f +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d73f7b5416cffd151cfa47083f8403472b69512 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed6e1e0f817bf57161361cedafc973a7f51437017c78cb94ae8e5447c6ac4897 +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69c4dd61b576acd1bac0239c9b7c185f5d89b94d --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46f4ad858b4443d4d288024efbb7c20996843fa16238a17d5b134276ddddd92d +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcad7f378afe073435357107fedbaaea81329d18 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8529c5b5934ef68cf9787ab954058997984a22741bbede420363becd6d5124e4 +size 199058733 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ae85747fe4f349322663d694efbbd035e200261 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bdf7c652a81cd3d03ee7a4bde3a3b892fac0cc15f9836c3a1f6c5d595ee194d +size 199058669 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e674bf00ed0557d76500d0cf2ba5b441954cd001 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c6c996d2501721431e0023ae42cddf72308ae432485de7691ffd3a3dc26fde4 +size 199058669 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..8100629496bf53eaeed93630211c762bb33bce1f --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21f1371b0b90d3e21483b644a9fb0695d2adfc1e622c9fdb812c13810c65f028 +size 199058925 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f932038882efa776642a09c4b7ac0259cfdd624 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d119315db4b64a1583062600f5278b700b3fd9c0241ed02c74ea1010f353f84 +size 199058925 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..696acf8f0f6da56758bd76254612e495db70227c --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:549625ce7c0dc39126d81f4c70f44aca40a6fbab8e960f29a90eced520b51626 +size 199058605 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..727d3c50de4e5121c399020c4f34a9451691d834 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cea699874f3581e964da8ce43515c1704688255ffbdce3b4316754b9238812b0 +size 199058605 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ccdfa53dd67d1a938bed40ebc249300b519da47b --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61480db3de7990d3d186e51e2daf664d003689f11b5eecada0b488d365a6fc31 +size 199058605 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..652d42dfbc2ce6e6baec57cceda90564888e66e6 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a85d2ec91628a5fb89136ae9e04669c8a42470d5297d4693dd04f8f80d954e3 +size 199058605 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf407fc6cc42fa2b2226fccbd0d62b7969458b84 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d5fa2c7616c6b26c72a86fc0368c8b68e4350b1b33770f657b1f458ab4ea4291 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a3e4077832538cf1bb6d8e49e93facb4a292cd0 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b2bd8ff6535e30acfb67c352e646488b2351c19e7cd8b03013200f1b7e72424 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62342221135b706e09499fd8311aaf58960b83a5 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee26d102da4492582a1d873d9d0d8ab7fb07d0be768c51c3ae5b73891724820 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e85d68daa7d349d6f37de12e8b8644dca17d726 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2a11b79c630f0d744ae1f80d16727f4eb26887b142045f1c669a37f36a30d88 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac573623c06e71059b421ff59dcb9f8ea4808af3 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6138b2f1488f2985047da0d08652d2c8ccb7aef318350a7dff41e159b5e247 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c82ac8e349ed1c7ce9ac38fc634fd0e84c620cfe --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fa5985f2d88026d6b6f48cf662ccb8701958a53f299751c6626e9e0cafd806c +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd0be648c7db3c6c34ef1e8f1f8c152c7bacf3b6 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f430aa0e62802b1a493a31dfaa68437f9073a3df89b78112bdda5ec3f42e89f +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..68c02db009d74ee911b26233fd660934567f98c2 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5c0d6530c5e458dfb04566c88384d3e46aff1c4e5d2b0ed36166f7b8c0e1a04 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..361d96eaaa64d236a6e04170d7d745e33cfe25c1 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbfe75cbf5ba661dd1dee4bfabddd3ef21178e2aa2e4e70e4a8f58f56c989ee1 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb8559effe94fc84f8eb502d498db70efe0a1d1d --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3218dc7bb3869eb8a5af4e14e5a450cd5052686cfc2d8d725401b664ef97d8e6 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1812457d2191589386ebd8c0c817ae6b01948c58 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:728c9ec27bf9ad565234892d598e04754a0e6b71f4846ddc112717fea52bf19c +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..050774ceee26effb0c555d63070014e5b2432b20 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23c9794aaff67f3039ec3f7274c552495eb4b9226d410acac71fd425b9a350e5 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..93c3051d07ff76a39a57fc4aae6d1876730992af --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:546c9dac0be00dbfdddb5b126500f8b5b97ab55206630e5858352205a88f9656 +size 199058978 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcac9a9d37ae4efe5decaa607b5ff8797aa96612 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af38540553e39a55a80eb98e675a9d4412c11c8a270647190f071a93ab2db32 +size 
199058978 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b76c680af7e29a959774501f534d02b69dd4622 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fb632fc72713853879c0a41d308bf24badb967ffdb40ee574a871caf952356d +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..980badcb67f01b42c0ce1fdaba4226d45f2e12d2 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65284fa75f9fb544b1820e77d7f2810a1bdd5705ef676f274fb94d4d78c21ece +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1251cb3bb624e299f806460bce20350482e651af --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fd18c6f9a5dda4ee7de1d04a8f9a95e8df1db4839e208ecedd218f091ed40e5 +size 199058647 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c47ab1fa6480ca123278812ec7ecb876ee9f8ef --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6d5d694662184290aa4e77951ad7fa618c458c2703173675c71361884761710 +size 199058647 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a5082dcd85c8504b24708d7399ebb7bb00059d9 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e079c8f670c802ee08fbddaf19e0b0dce17849293b138c6a1e45274ceb61257a +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ea6c4c1a766e90cc244105a328d930b33b765ce --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:963f33e91c36165da4deac147915067fb4bc935ebe1cc61ea491c2e5bdc936df +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e10d83c8b7cdd24350dac05b54ed4a2a2d1cf4d2 --- /dev/null +++ 
b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a873c92c77e6579fd67a112d426615320af08cc40e8fb8c2102fc6aa442ad757 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4fec86e5a38abf8fa9261ac654057e29724e031 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ed015aae3cc77e91012c0edcee37f0b036c3485315aeb4992d661b1bb1033a5 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a78578ac779e446241ae5479b36c154e546a474 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39a5454157cc641be957a3bd35454ce5c6ce11bafa366fc3fa9a9db511b341a9 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca21488b4ff2966c371bdd795d70658220a41ac8 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:978f8b7e8acd697c4db29dcba247a895aeb2009a1d75eb1bdfc347e46eecaf09 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..018bc57467d8e4e7a98869879ebde137d58decc5 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f49f73525342a8b6d36a4dddff5bd91e553746185d7050d6babfa96b927e6c8 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..085b735261bdf27b758fbbd498882be83da42ac6 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9695c1ec1e75ec6fe74f3d2d67a13e59e384b1fab9020972c9258de4b9ae311b +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7827e9464a1deb9caaab13e111ef45d568d02574 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f56683f667799b616ed8cec931b85dd31d9626cbbdc75cb5aac687f5c93e0be9 +size 199058722 diff --git 
a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..495fe12a5262ad82545a48312c94330b04444328 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14c06a32cc9ea8d2b24b9950bdf0720182745b7224783d81768247b5d1a89938 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14f435c237afa989399e657633c9a8d817f1b0b4 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb33b46e3dbf276b8ace09caf9ddcd7aaa04a1f3a55cdbd89e5a0cc39bbebc6 +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a5e521cf309ae888b3e3e29b6a2f900be6915b3 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4aaf0afc53dcb910978ec0358f92caa9eebb6339382f2ac7ce3fbd94229f6d4d +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e72b4bef4828505e3aa215b8a70c271dc0d55519 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1c10ba1b3b93f2f00c5a031769a0f632b9d8b43b52dceb45d83ccf4ae7a0d8a +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8d2f882163200df2f70e294f3c3db297177882f --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eeb8773a1fbf1cc80cd841d6cf19623d9b1bbaf3e539a1ec8887d43bde61104 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da6f112687f99ecbfb3ad3410c17867cb6bfb905 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83ab601fe231e4d6d32da349a3616eaa25c06391381921ebd3cfc30d1bbf22d0 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25b9ccc94c81e66bbde52362316875f21e00a160 --- /dev/null +++ 
b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55dbd2689b230bd2687e210cec69e5a721e694b0734df73763b63fea7afffc11 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..014278eaf3224b4109ddf00a4397355b4bf36446 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:650342982aa0b0ada3a8fb43907c8d36a0474f5f205fba1487d23fe02225d7c7 +size 199058594 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..04eb4947b2af11acb2ff6b4c10b151e132b61b48 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1a1713b75ae83d8626d35199de7082a90ae5ce80713a6a85bb7870b86dca38a +size 199058594 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..870d434f2121bfe6e4c3726f0b601414223c014a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f47d36d715dabcc7e919babf55a2fe725503aa0c092997576807334269ca1e10 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1417f1494ad79648efb081830571d83c94b24cf2 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82cc553d41a16507b754f3e98322ed81a62bdc4cc6a875bc76fb60faea600ecb +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb7ce9a1b47562daba942fc4104686e79eb93e3e --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87d54522985ae971d975c889616f7fc0c9313283bea2e3f444342d672601b37f +size 199058711 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a82da335d3cba341d9aa0fa29391b2eea123c286 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f063593686047fcba448bdb72a47c8a363fb9aacd4f25a2b30e2dbd59930a556 +size 199058711 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 
b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f170cc0876209ea37e9f082da9895d92d5682b7d --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88d6be7e38c37051568085454871398acfbd2612ee016a8a5b6ba61c6cf6d308 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11e38e036fc070d42bfb1d712464785833cc8369 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97ab2430d0a3213eda0cf8470a0c72cb7b0257cbf0a16673120b421fe139c1ef +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddce25a328cbf0a39791291c74601989af14b4fa --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05579aa2f4d4b07f4c2c13cb10b7a1fc4bbced7ab22ce701d991b0c128f90508 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..956ddc0c8628b8a88a54ac28f7c5e43743d86770 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4e20e340c4e901663a372ebab51ec2c3750660954c4071e7435c21e09f3464d +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8e4b2ff7127c225cbb66209a236579658e31e5d --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63b9627bad7038067df1f5487f872a082f5e6dcb682b05d9996f0921e5e73ab0 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c2cb01f892830068caa260f2adacfce820f4ccb --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:666fe8fac4fe4042f1b1e77156bfe16886c6f3ea1360183cae7d004ebcb57532 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1ebebac4df5adcc66c06f7d0f279f72344de1a0 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:ab719d90b3c97222f01929c7c73bbeca46137717e8c46ff1d741aa040ecb1290 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d65fe3bbcf97c01b22118935ca2b19df1b5c609b --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bce0ef6ed5c0698cf175d1dfc03ee1564ef00811e145a97fa6be3bfe9208129e +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b8d23dba231af2a26b2d816cde4b31bc4618ff9 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1787b3f766d0af73d6d4eb4e43d07dc7c95e7d6158efcda275c0b6a18470d32 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de68af10a6173c7fc1ee92c141f6574a15aff197 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bf12fb90f03322f925d4973a0463bbe315ff0232bfb81394a5d503640813be5 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b60e492090aaf0f5ab40657b00efe7c3b22221e8 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e57c732fa316be7d429fd1002b4fc6232b7321f9b19d59d6671108170111b7b +size 199058594 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ad16a198643a86df8b9bb56e2e2c147f1b0f9a8 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c470a67e3e52e80da31a66ce33f67b666d3bbad0f71c01a7ba02ddb8cdc5aca +size 199058594 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7d098f3461a505c0375ddc9e1d86990b5e5d720 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6af58cbbb25d6b581d865d9c7de40bc06f9c957da6b20330f2a3d948f872aaf +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..0f198b8fddc475fcffb7525a5375326f2633dd0c --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a77d2017ad49dc2af7bb94c533c66509ad67ba2debf0741a5184c4545e5bb2db +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e4e44d28e3cc2e4b591b3321a4c61ade48486ae --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffab10de9501bdb6b4334acb0b5d2f570e9d030d2b73a789fdf4f1dc22e31fbc +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2654b98f37d5b7b79f6c38c1564bf2240f41c00 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711e088eba818bcab13f4688fcc35a25d8d5633f82c0a34cba0432e9b35319b0 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f29e34da954b57122ad292904846311baf540fb4 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e752158d6cfb1733d69a3daee34a190a25cc474a7147146b14bded9d7570837 +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..643bcbee6c168a9baff9fd517515e2e02a0b31ad --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6e7f3d7b94268a8d86f7e9d745b529a081c503a5a3ac4e8a308af60d91f4ae4 +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d4060e766794c883ec955da03a1eab7d7662ca0 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:605c8bb57d90c1b467ea4141038bfccaf9352f2924180a8948f679c75f8e7134 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b925a9daa2eb9de63ae1839122de88e8b7ccd1ac --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75909e3dee4b468fa2f359ba1335a487a96ffcb58d3a81dc7143ae5564e46ab8 
+size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b2d661950384c231c0986bbfb710d43b7dc28e4 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fa4813e7859c25c19c96cd80f6c2327d28966a938fb02596f53010e763b9573 +size 199058647 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e32c8ef5e05757a817af4bea1be80085e76c89c --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e420b8dc94887d178aa15c9c1b18836372d15ab3871b26752ccd35271004ddd8 +size 199058647 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa9097e49fc19114b3ed5916b72b3f593a065292 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f200de4058ee4b53aac22cfd894998e4f1554a03f230d6b22910f7de4ffaa386 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a085d12781a8d7fd4550656ea9d01217a011a5d --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02f8481fced8fd89fae234210fbaaa29e7943ad60736ec7601575c4e64786d62 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98400ab2ad83b97f1422673ba81fff3288aad7af --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c271be29ece6c84e2430198aa977240692133c37d862f043923016200fac0a4 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4acd96f1fe18ca61a8947340bfad96a9ba202107 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8133e6c058e5b28b38be45f94ff12468d9b038f7d940ca23df3d9622c27cae4a +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c2df4298c468e91221943895f5970eca91cdd9b --- /dev/null +++ 
b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53df1d3d144262a03b1026b224c37675980a5bf4dfc0cadb378b9810ab820048 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..929cb87a3a8586bd54e5267cb91b71183335e83f --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f95ddecf20b4177bbc1010e56e46aa524471b3885eedc4a91013f2920665bd6 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8fab7cfe054c5af5fce887ac13f71ec39a1514f --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9889dc3d8a6d5bc8dc7be489888bb0dc930e163a991fbfe514f8d1f7e1f379f +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..afe6aa39bf1550efe7b850597fb82354e2693bd7 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ca2da12723760ef37d008c0cd8730629dbbf10756d2bcb70e3af21ef3c927a1 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a292a7951d922f51e056b1ca3e37d17f927ef59 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c06230012dd2bc6d591be3a88c6de7ae0268a3aa8ba0464d585671cbf5d6acb7 +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92482eee44e81fa0b8b21527e2ef63e6162ccb45 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:347df7961a44a03f244915736fe18baf75146662d44bc2c779c0deb165296eb4 +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5921c81d81fa32b2babc49fb1774305b8574fb03 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09f4ad5f49aec34d4ee66ceabb8a205c8b405f6390c0428937bea6f715068d86 +size 199058722 diff --git 
a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cefd1c94449171e34623a9ae3bb2567595a1a12c --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:456515adc14941906075e97c70064576b4c632bcfc3c51bfaa203341808fc401 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f1c3d1117f6c2907e8b336ca2c141a745a7536a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6178676c203420c4fbedef6e5c179be321df1849daf291a6032a22120a3b1836 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..319a944bae77214c3b1465ab9676c831de35cf15 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:856d69de8b14495538dae48044f4d7f7a1b35b0d1cff7b57a4e77bb9f2096386 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ace3bbc7fe8dce4225beaa58058ee8eae9ed9c5 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57b9c420b0f287604ddde30e0f21b910a5199cd18fe7e223e3d8df21538c963c +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f0eb99d55ee569a81fc4967b5a46b558da8aaca --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d859078687212bf2dbb0af1d8a30da2eacac05104bc8ba0d9b34bd94ba5ece0 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca41cd6c99ef8df13ee8d99521238064eae04aab --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d913cf85c4053b143098aee38b89135f42e017866deb5a1fb88e7079fd4cf8a +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..609638c9ac83b0bddf1cb4766c54aef537bdea86 --- /dev/null +++ 
b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd4ed4db4959de8c749a235848e239b7af35bc82a3dedb8f4111c7e8a50a221 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b17daa3bb0358cf6e8d71ea91aa38fdaca8b0950 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eca1446ea2b1dffa5b4ac17e8b48d859b14a8085cc8f72b4706a617703b31158 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ad0adde6bdbaf693fb39349e81a7d8dc52053f9 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0c19b3b4f52d177af20752b90542d3fddf3bea6c6497eb86afe7b7475069d86 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..904438ad4e58aae28644e9a9c1e1d1ef91387b25 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8ccb462c3edd8f8320dc5382cba89d2578ccfe2cd95c989a12d58ac78e11d39 +size 199058775 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ccb4e87dbf7b3230a88e8f03cc7d14fdb59cc2c --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5b06976f11a0a63148bedd33ab56ea557df7cb8a4fb5651f2d9e14bd8250d14 +size 199058775 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4a4d377e731685a9d543838860f444fa3dc4c7a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d019d5375b62679f7bf2cac65800ba3dbfc6df04b4fcbb39eda7c05af31000b +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..addd265fa1ff7ed796fe9411583cca736db57b08 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5776b36a36393d8ce1c8742013219ec926bdc4fc22fd25e7a9c3076239b343f5 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 
b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a383f29bdbc9174b034f0406318451d52a3f938a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d87fdc3f836089175f03804351377cc0c97622886b911d8c17a7cee22a683c06 +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2515d4edaa0f56be8835dacedf509d583f0d7af3 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3289bad548cdf271a2283bb9ae18165faa08afba4de1dfc24de71d3dfb1177d +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc76e30ceac5015a13d806facaf49261d96a15d3 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19da26f4c74473d41cffc7f73d07859f373152ba4e807cbe1cb81cfcaf9bcd23 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63b96433a09e68bfa32b7090d240133ba6e0e410 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be2dca72fa4c905a4e25824ef4b778e259f734b3cc55bfc74bf624ce5c5d8d89 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03b6d687887a73859ff1d1aee1cc458c52fa0fab --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b016143101db1ccbf06f5c148bda28998fe4ca2af6ce3e215642863695115f8 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef976a0795b1a178f3f8ec4ff4a4575f2f5c823d --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ce606e27e1c9bf31841f0b06605f0e6efc7d882a20dde2174cd389c1c38b391 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85016b761eaea69172c0713ae573e16d94877c28 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:32778cc98c85ef28bb16c4a3982a79417597321d72864f45320b2430704800ad +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7ee30b63ed426356bb47409768d3db07e005505 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d71deb04d601e3942606fa50d862023e7de6a0a4bcc76b9b8f1b9717a3a22ec +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b64f611578e936ddff9ae0891120f8787ddffbb --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:082231ee715e2c806385ec60c5e3fe2b568fbbb56cbf4e61f6617517e62565c7 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3bbb50b6d408e8027cbe36ae14dcd26275dc4ac --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4881a9397a8ca973c27853c04b9d1381427654da9184befc10b0a06f47a3e0f8 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e54701b6ff352ff746967d694b11fc92abcc5c3f --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10ec18cd383238b186422e8e4c2317fd93e30bf6e23be634f929848df33b9a60 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34ce4d86fbc2ed4350a0658871886c1266897ad6 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21d3e96774f9a24e20b518fcc6d8d629cf99d09b1c1959d65ec0f6baf9822390 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a8f05a3274290a290d14f7f33251728f9552b60 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ef143cccc663d1a75eaa50c6ef5f5a44e5664874002c91c077d5cb19892c366 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..c3d57bdeeef35f4ead26c84c6c07c6f819635cbe --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55df6c723cc305b8457a1e1e6d11508e806be6bf9bc2b2e517fa60d19f61583c +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f468ce792a2c2f763f52b5bcdec14ea699eae39d --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c3e35ff45729494ffc4b558b9e3229246e6004c071a7f032c7626b4900dbba +size 199058914 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0eaf514e58da95c4e09c38a70107b3c97ff95ba --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4602787c3199edcfe85a76c0be6f69c69573a5163908a67cece78ed942a7dd72 +size 199058914 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78fcac2afa7f30ceb8d7dceb95c35602399f250c --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8db3e4b0e07a3a4f15d9894c16917a76c26db38079f8ab8f3b6f40d552952b22 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f62a6c2b2fd82881c71d9bbe32f841acd011d623 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7a90ca626c8c0b1a0bfcda08cf37d8c73774f77d79cff873b0b107ce5048dcf +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5afbea3311daadf6457f781f2795380bb249d88e --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8c31b6bff32510e4850515422b5ed7d79dc18170224feb7f594305ccfb88aa9 +size 199058711 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2a090f4790e383cb82f044f5864f9830fd5f15b --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88d8f2ff6303252b2d5d2beb8b6a7698d21358887ded1281a1541aa2e9d9d3ac +size 
199058711 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1b63a1171fd0ffebf988d599fb00b5d807aef36 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2c35cc29715c848f34c0102c40ecb902234ff8b5e3d3846182767d72c7291c6 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5e821f417f7ad419adb4189e1ab6c9330c3da4a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9508e199ce6399af1713e385b89e92be31ad21c94ac4906f5480da983f998b3 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f821f8daf83f17ebd99e7edf76e35383a0e87f5e --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39880c0cf7ce92d6608da2d2f228f914e7c56fd30b0ccf219e3e6dc7903832f4 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0a7f1aa96856d40737a968dace997d87a307f69 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cd8ec6931acfe3ca0384eb5f6a3fe5cd7e0bc825f986858f97614617aa21bfc +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9794da2eff98881dccaf13c14c1d65006cd45e9a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f71eabfe6986a1d2c4555603f239d7b324020fbf8d27da463fa8eee12711ce6 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..180d8393f1f4ba6fd542ae4811e9eb6477d8e780 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99661548619b0e0c8ad49da556744510c307df08c38c3edf5a3c369332be112e +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a237540bb4fb844f38108d5160a44bdeaee41332 --- /dev/null +++ 
b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daac839b70331792c00e25d115b54b5ed88a4f2073734060f4d8cf4403243dcc +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33070b7a51e2aa73c6eed170194090ea02b6f273 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fe688a1bba2adb2ddab8409f98a2e5a101bf37f9f7e70b61477068bfa96adb0 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a13e49d22302b853a6f0c2b4b98df22f23dadab --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:464e2425ed6d1920254c0f914a82f4d4eae4187d56c6bd4829a29908a34159d1 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdfd71f9d12581da642ad8bf16e4be95cec0ecc3 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:566c150cda2171e9ba9cc32ec53672a50fe2db895d32357c563215c2bb7c612c +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e4acc997cfd4fe4fd44c93f395c26a8bf7f3a64 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f71c11f2c2865a7313db1e7f0b69587de53a1af2cc610467dbe05c3ec3c96dda +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6c6991cf892e884eaa3c2b172ad340fbf948ed4 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d9de52d99acac3d8d705647de85d96459af86467e8316cf2052e6752ab67a2a +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bc7260046dc83088f86ab18902aeac58fb12d02 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:768264be23c66c1290821a0b5bdcd8851f106477b862ec32ba765d24e071f698 +size 199058722 diff --git 
a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..495eb5eb516c232c0f17104a9a7ba9ff674c5b7f --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaf76301dd4019254eee3306488854ea7d52e8455d67c70399bfd49496be7266 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8e67026ebc27d6d775a216d1ede7e4d25bacf64 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0584c80dcbd5eecb588efa3991d5c99a5158e85ba07326202df60e8e5b43c1e3 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef1758805af1e76905c26d60b4f939946c4620d0 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b559938455a5f67d06ec0ce5cf1861be1af10077d82318f69d337905970e347 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5dfe07bdaec0d25404d9736e9123adcd86d10c4 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a942709ed93f9ca073b7e70595e0d6b0ebd21286be3d4c141fac0d34248def4 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35ef8d5c8cb48463b323ad146d0b53e28d3cbb65 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05fa16540c9700c3df085b9c148f0356e46840045a4ebdc28c18f974ce8106b2 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..93835fd5ba4160b8bf5a4e505fc21297373887ee --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d44de58a1a6a3eff17f17d37be9ae1e0bab504fa795bed3ad09e09e1468c87aa +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c195f3da0da85295da8303acd5354d24c7eeee4c --- /dev/null +++ 
b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbad5adb3f3d82d0db42aad993fdac39a4a6ba1925bb72341bc4de1251117db0 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ae285b9529509d298828cdc185ce70e415f788d --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb69a9930fc79c80cbd03f465db5d599887b52054868dd32f238d2f485af8ba3 +size 199058647 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a31f48fddd9ede3f18eca942d06758e0c7b00df8 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5cc30c16c9339b3a8b365d2f9392db23c4a58efabf84655679fe49cc95e65a +size 199058647 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96128649cabe24f7fe8c53d8089ffad90054eb28 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45c99d5cba5b13d2ea0a0f59210fb62f1a62fa1e67364451d44c6f3ac20d221e +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6131081ab2d76973ccf949c460163786e8fa135 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3150c8f626b9ca9a9a71b0320f7fa42389d092f4649230d582545987e776d594 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e812532637d8a7895b8cc7d3b961fe3edcb0cdc0 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c73e4cecdc3f5985861bab7254d9ddaaf5d9f9554439a2807b42c9f40e2494c7 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3170a1caacfe702252d5b4b5695ac5a3bb3e0aee --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c27ae93b5cb5bc3b1d8b2348a1233d29ffb41d2d4ab30a9b8b498f0575e1c16 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 
b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28c16b5d49935923b4194173a422121537159c4a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5656f829a5b0e265b99d427258986d86c8355b458844a1f3b59910b0d91791d6 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..52a4cccc72e0ad630572d7d3072fcae4bfacf012 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d58425cf3e8f16108e166e311ee22166076587306a228b24806c46153091b7fa +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91a503ae1fd10ca7cd5433f7a91e7258ad6b9703 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b89d04ee804541cc81684eecdea486b3247a2b4a04db6ce47db0fb4a2d1ff319 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5cd4299b66cb3041465000c54f20cf43cf1f710 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b0da231f93f0b39a7ecdd0b7748330de9220da546185b08fed88d93d7a91c4 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af087147c9f637f80dc265f96c5ffc6da31db21a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b84517117b9259b46f82228b0cf4f8abcfaed38c8705982424b0de0212f8b9d +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5579e0810c27ed905d4c17cf654be90d424e04e8 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb291740ac2dae79867caabbb3998471ddb3acf83dee41fc3791193561eab1be +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ddc163bd38a0d2b3701f3e0de77c1047bc5b157 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:975bfa3128db85cff7a298b5dfbcf88e60166440e1e0a67a28e2fb89fde69d21 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..269cff1004c876f135bd76b4fd0916e2c7c001d7 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2529f73bc4083aded2985cf69622c54e229ee536825fc76db54a489a653e0ff9 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe1104aa155d676c9b885b0c76c1129aa4877c30 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4abe79ffa875a5e7aa53824d1cc261136ce25a7ae5ebeaf787ec64ab6f3baa2f +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c04771bde51addfe5499600b8f4199407571a651 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f2cc2ba51e39f934a40ae5399f2f53dfe9c3c41f10413bd04f561476377dd7b +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f58b40b919930ea08490d48554fa1a0a8fc2f9f0 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08f18411b2d9a2a05717cb6fa5bd69baff22f46ed7eedd2d514c35ca6841f0d7 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88324b8f500cb3ea865e40176468c8754d1aa965 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:904355b1367c7dad9ffbe03dd3df6dcaf7ef606081b67e49dd65d582ba0ff4b5 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1a0fbf353023a51365e3aab72eba4a041cfaaeb --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29be92b2339c8196426b2111ffb8ff45d27c40a7ecab90062e49a522b98de87b +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..eb1c1d567ee2a16f7b260a84abf22cdf297866d8 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a718912a2dfb04f3576dc4f99d660edba3e6ea82c14bc9823101c9fc586cd0e0 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2439f5ef98d945122860292352c4d3321fce27db --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a78721a407f13f53149965750c5bee3beaa5148e6b70fa074499168a32bc876b +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a4c8f102d44779abaec868926770c27a3c269cd --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d852a33e18b395647f7af0b9ec746d49d7dadf4464db9f9275594b7e02a45fc +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b3edfbbc3caafbcd1584e02f259a29f16dc5df6 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba06079f6e5e7c156ac6f8ccb0057cdfc69076061c19ff894cbc9c567bc53210 +size 199058647 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69aa2089677ae70da9c77ba41fa1e3bb67a01ae5 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3601d417d474ccd704663c13157612db57aea548d4a42d9b33f728775c2a3324 +size 199058647 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f19c6086d7928fc5e90376577ff3b7b999126bcb --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fadf312f38bb296d37ea00ab5af71d509c3accd6d0336418e37054918ae0d640 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65e202f1534f62c066ee36fa0c771080c0da90e4 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a8ed80367479e2b1b5170ed8088a0ff0defbc5c0f885dfa1f13b0a0db3894be +size 
199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6d1271f44f306d564721bd4e3fac4a687666a89 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94a907cca80a33606cf52c289c46bd97606c63a1bff5db2ece40a482762ce6ee +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a51564881d9c38d638f4765b573b5859a6e432d --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81e5e3caa63d54736a21472a9e29e54b74fa5cb2d8b9255f40b052312478c0a9 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db0868d9d7d6ea855108a0c97782988042494084 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7630bdc8e4f6651e04191c6bb581e372d3c5024fef4425f18f002de74bf2cf71 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2216778c4a3fde482ea5ee7ac12897802ff98da --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f6f968bbceabdc412de942f68a16c0156905f0dc3525c7f75c44a5cb9979fd4 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af1c4c1611b2d0636f9736bde88b284598eb6f6b --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2399310a8fb004c4571be3473b39d3390ba810ae6f20eec25a19e42f28a39738 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf10877ae6672ab44965d1d4047e2f668d5d422d --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdf750c1354f24bde1fdd3591449576e7edde7b50991c3287c497b269f5713a2 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8587dd234c178da54c96039f8f7f9d795a5df563 --- /dev/null +++ 
b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1f918d16e29cb79df936502b5f30855039ac558ef695700440021ea0b12402a +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d2c85c070b88d0cbbda9fe4c19d725891f79755 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b53523f81ed79b87de64d85498ba7bf1bb151919e55eb4de5827f52fe968779 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3a6c485579205e5539458b4ed47c57acd0f749c --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e94888b05273d4416650f44154665ca2bae663847fb7e58c410c539ba20528db +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73b97a2d9b2cd327377f2d0bd9d614f2ed6873af --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:480ad60dacc55d9e281d4dad1d34d0c2f34900dc4588082d8133caa88fa4c7f8 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c847c8b505609dd4c522f7690d492415f7b2ad49 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaeda1f3d4a5a81ba211255e91aaa47baf97864a85307a0207334700577316da +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0275219d0e87a166194d1fa6586300b1f6855c67 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ab8bff0d30070d4620cf466ff7a2f1b1deec006ee7a021f32197f52ef10a14b +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2703f68d08834e8c0a68d787946895146f41bdc7 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28376bd3be0fd54ab7d70636ce8e821342238893ec018e95cfc1114dbe56471e +size 199058722 diff --git 
a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc600bf6feb6d17ce84e96df8c886c080a8d6775 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a976731bbae2eba781a8ae1c2e46a92e756c2b1deb3ab2108793e1c8049ceed7 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ac5e8508e9b08867ce95180748b8797d7a0781f --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5a66f05f7ad76aced78784b16b65e86b6eee959546bc92ca069bd58b1b437ea +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad7e79852ebd045d383e4b0125f8ff828b402a35 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09eeba1fb308c31b9b40096d5d9e4c5910ddc66a479557c5291e367f57b22ba8 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d288fa141b50e4941d013bf077ce227f2200010 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8e832f2f264c5c2b56a6dfca08fa05acec2156fd774ffcd75cd5b267fa90793 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f85aae49dd3923c33d29bd8a6bec2a1f36b6fde --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66ce0ec57f5128ab3ccbff98e425f7b12877afbe33ea9e940d151f8f18708f0f +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..008975be73d2039514196fceaa62d78183936d04 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bb554be3e209fa2b620f26d0d20573a9ffde9fd371b52d2ed66d1013a271430 +size 199058647 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3977cb4658b58fa8b3a7577b6b713a729f169563 --- /dev/null +++ 
b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c5ec776830cdcd3d9ca644ede0aaed0ceb050b466d7dc80d889b5dbf53f0ee8 +size 199058647 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfdf9d46acd421edbf02be00e7455d0dbd7b67ca --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff12b0f4a7b2946a22100157f924fb665e5f3946584ea023d4ac70a17d4db5f7 +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f1c51f72222a6f83990585c79e7e027dbc30932 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aac5eb6a1bbf557713a358e521220fec84d466c3b72de2f02778a8bf2550b56 +size 199058850 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d4c23fc9cde3cde0141ada4e1024c9cd21d0e70 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6756ede8f9e7c429dc357d4d006032ff2779d1558f1d6e5fed9ef9cfe274f811 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f5d104d97bd2783b0261ae3d2ebfe7c48c6def1 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21e52d35857362f2c4a2b68d4e1ed134350a2af34e3508ccc06d9e77591acb90 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67e22aed88e38b9e30c1d7c44bb3738567a418b4 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:047e5bd7b86097a691a88ee229bf9947a74b40a94a1ac3db3908821f7bc507f3 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5335e3472632f77abd676fec3ddf16b45ac8ab4c --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcc1f422a8660a862b98c3c153d33cacd93d1f1c7e9cb6eafc90919337993535 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 
b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d57679d5603931a9eea3af3ec174006387c9180a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99f676426aaaaf3920f32290255b69e1aa4ec19ade8603e63a037da9f34d3938 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfee893e0049e79caf9ff4b7adc1501985d83978 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b8752cae9dc6a2ebbe1c6183076403f29cc8c400793ee1d2e31b0b80560f865 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1699b2a2cbad405a72d57c348ceac024e89947e --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:777c0b642d475bea9bc01bd06f11e7d9fbbb21eca3cff71c71d74ad3cbcb0eca +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4de0c567c6778fd48c7983419fc001b99cb611a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b28436fa4af9e34c462929034b883be46def9f2f70f4087c3e9beb353f14dff1 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be823a804d45ed9f49f581840aee0b3672898b9a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d78e663f291953e3cf10fdbbad8acb6e1e9e6b1be072f1dd7758c0e65e35bee +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..909e80a956891ecff1e5d50d4e634319639db2aa --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb3d5eb7a18ca49bed6a98e9cb9118002eefb70cb2212c097d0de5a980b2143 +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69b4025f0796f6f96c7e2c0f048ef49b37b6c20e --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:314437b32df2b134cb1f7ac8d9bc791cd22620faeec0c45bda3ab8cd56b8b13d +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10d3bbc88ef3b8cbeb3c722a2f495e75759eccf8 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:065be24f2db0f1060f9a6d5b8f480f3cf0f7a26b18bd5027f8dbcd5a3d4c17bb +size 199058722 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bccdc6d55e8a1e57314f0659781bc068594ef0e6 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c05f7f4cc06e58dda851072999d4e59fef7ab916f031847e6893f0e0a30ed53 +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aecb551546a26eee690b70ef5a3fe509b65ddd2a --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:589350b9757141a104c359e9ab24aa0085db6ffef493836920b3d41f44d66dab +size 199058786 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ceb88629425341f411b6b9454f4a9e9b4282f075 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:274f3f0463fe8a6ab12ddeb0ad30c1ae9d22871a520e002996fee15e4bd0745e +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..523dd1e51b0967ae324affd1d7c1bf55fc8cfb1c --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80f7515507deb52b9009b177de3413bc6dd0d80adc4da1dfd1d60fd027dfba04 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b43a4669a29bb337b300ce0f23e0e5dee59d693 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:788f6eeef991861691659c8c7be524daace03269dc48ac09bd151754269d478a +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..d303f025b3f8538302b335fac547560e02cfa082 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8f1a7d574f5836a2cf3f754cbe26789c48ee36b836e2d828b8a56d9f28d4668 +size 199058658 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1dc17e7ad45582ae0423a8a93ffb27b454818e0f --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:246f9a6c39cd020a41ae9a327531ff80c5a84157aba54744edb4a4115bc8118e +size 199058839 diff --git a/4b284b42boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2dba461f67a6c0ca073e607ea50979294d640f05 --- /dev/null +++ b/4b284b42boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72af7ae86cdb709e868b1a57fa6bcf0b945f9afd5b6a3b2ffaa3e3b88ed6a5b1 +size 199058839 diff --git a/4b284b42boscar/global_step80108/layer_01-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4015e41c27b26b80eee066e6540f54fa4e989a4b --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68192c23bb4e623cdbb1de0f13acd51c3ef6b541c8c758d3c0be4e140d53bd31 +size 167511299 diff --git a/4b284b42boscar/global_step80108/layer_01-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_01-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f0d7e27d7a6fdbfa2ec9a425511e97f6124e053 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb9614ea16cb1c8e49de4a9ceb1f313e8ddc7912b1993337f80a1f0847d07fdd +size 167511299 diff --git a/4b284b42boscar/global_step80108/layer_03-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b00cb54bf47c74963e4f876ddb14cdd71024a1fb --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cce5c6d0a42ac47bccbc8cd0f5e458804db4ad5d442512f827ab0cd0ee6bc31 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_03-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b2514676ec970f9d6442e61027f029877b81533 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fb87d9f6e7b7e586a24e9a578a3773371b70fb80fbc6a967381587429d8a9fb +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_04-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_04-model_00-model_states.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..764ec17abcb11043b9cfaefd53dca478f5416a84 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4e0e48fb4002a89878b4641e6e0f0a33cc678d3945afc0264999828aecbd112 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_04-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fda422480209f74e304d55e52b941f8ad7177226 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:483de90f37bef3798a5f3712734adbf0481c4f05e4d3a96de22c32a17d70aa85 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_05-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d908ed81a0caf7e9548e21b7df99d839b5d54727 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f638ccdb531ff7467f7d5a39fa89c5c9f24a977aff18267a0160878077aa6dbf +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_05-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c36fa48ca78c8a8944e620cb574f5a59abda9daf --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9818b55dffce8d3b944024cbf80f3307894e37e6b10865328e28f6188c35a9c8 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_06-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4562619f48ea488893b207a6964752ea012e2170 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54c73e08136868e4012cc5570e218f40490614446321ad678d49c5e2330aee53 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_06-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff4160169344ba9d7149bc31cce92b735177aa66 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df42f0000318615266622ce67314f81f2d2ffca348ccfd16c82a5a4f57b14b51 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_07-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7165a6252361929c33b470fd56c21333ce4890ed --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ee552f75fe5380c5b3e311a27ddfa6110334637cece4217d8853d698844b440 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_07-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_07-model_01-model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..647f4f55af916df7214b1dcc57895ae960b23f99 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc4d406f8df48bbf850474fb47aa99640fe43ad4d4f384a02394d95a3ff5950 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_08-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36736a200ddf7730fb9d79420b42e92c9964a64e --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b606b83dea0162d3672114c185366a8f83d33f914f8300e2cc13a377904e3ee +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_08-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_08-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32f13ee6b405ce55e5fa134bc6dad43fb63d2e28 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd89de6123c3aea56544b18887fa6b2258d9845b40779a489a1d343184453e1d +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_09-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e111cd3eb79e22bfa218dced9f0686ba26c556c --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c20b09e4e56c6f2448c732648c8953926f6db60c9fb63de361209186c0897198 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_09-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ef882b525b7b6e6a214e3df40777543a52b68eb --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2e5265f7722cefd98188b2e3d7d563b9671749aa9185d3a2b50b6486e41521b +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_10-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a61a030df8a8a0fe348e422d3f0b4d820dbec59 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8df98c08de21dae6f8d5a6dbd20e3da6e846343a049a7c9963d936f73070599d +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_10-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9624e7b5d3c9905b8573b9d21d8e9537e60cdad --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2919203bd9dcb91054ec00a0c83151176af18afc59ee984aaac031ce58ed567f +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_11-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..656eb9496f152773defd7a45e097e50f90d4e1e3 
--- /dev/null +++ b/4b284b42boscar/global_step80108/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e1630c91a717ef89c91cec95ab0b00bb385a9080dbc1497ce843873dcdafb9e +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_11-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_11-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..200e3645e4cd61ea4275ebf398ae31c1fb415173 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8f7a6f5d27e1ae2c1203851be61b19f75bebd4b33298fbfedfd5950ca11cf2e +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_12-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..12444784180c497966a6b0e832c4efafb48f5cb4 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b1d18f3114004a9e3dea3b36332c95e3e1d8130da5f361d357f2ce7e27d3a58 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_12-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_12-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49976561ee3d758fc701cde8f704262aeb8638ae --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:361c5913327b843a7730431f165434959140236ea60de877184783b0dec923f6 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_13-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef6b26efb20045fca65a42a4b5a5dbd37b400838 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:410a684026d703be24f6322a09459ee08aaa3ef24e34b4bff979887296c5385c +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_13-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_13-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fa388e768bded4cfa81a7ea45e487040ea0ed4c --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c448cc733545304fcd849afe8a3527efff332f14c5d5825c728856b688e51682 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_14-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6d3ebd1e635b43c55286569dd94ddc27beaa6b7 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae9a11200c0bec614d773cc4399607fdc7c4d4c3b377112f9c6e041b0fbec8ac +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_14-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_14-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b17b834bfc5c92147dd4a1913b8ce50eb0da5f5d --- /dev/null +++ 
b/4b284b42boscar/global_step80108/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40fd99521812f279977f879dbd2d5e29767dd9969a483d9e7f1c9f7c729d9f20 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_15-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6ff71e325f89c219b508c2047ac033a671ff04f --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a05e5dd661637f8a2a58443cf72f77a0332697f56f527550039d375d6c4de9d7 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_15-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd7033508391eb85194b42ad31db3ee99d89485f --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47eb3f335e1a22c23c428acb594d3c8187834a3a3a9227d67c34f066d246b742 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_16-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63187738e5d4ff3c69b8fa902a69a7bb143077f8 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f76f54f80207d9cc1050f2a9c55e5af0396ca0da883530da5614fb8cdebaf68 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_16-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96d3fc9689abd0eaf5f9ba5ced3aca1ad838a6f6 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43b85e59452249c5434cf801658be8576ab76a1e52dd9c8580de172bb774efa1 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_17-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..493b0488a8573d9d6708fa1fe350ac9cbd67fb7d --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83a755deebef9fb2359faf75515bf2eb0d263584c92b46cae8244fd6cd2050ff +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_17-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5e530ff8e589dffbf405b76e537ac58156c1fe1 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30dddbec147b6cbe6e8f354eba25387f8d0c060fcb19ccbaec8e674a0677002d +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_18-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4591fac054a6bfeb425dd16d1b12a9ecfe46280 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_18-model_00-model_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a68e436adae570547deb30bba7dfe4dbc61f5b6ffe105fa821f35510dc025695 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_18-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_18-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1272c090812c9e07716cd95992291b7169655eb5 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8da80d5506ecc114ac60478d0a055616fff370769a83719f66ef4002de1df7f0 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_19-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4252093a1a176b57647b663080eb5a73c4f1dc2 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06a15d1211b3371f42242cd6856e5dfe77ce6e357a220af9bfa7b5cdc3a981c4 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_19-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee94a06643af6d97e3cc4d4c6b9e71ec4a4872f1 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d4d704bc14baa6a36dc4fef6100c6ccd1af5350aa7ae1c729d98a9a2aebaea5 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_20-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fc3db2dc64bda78d84821f57d17bd137b6c1fb6 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:322c36fa67abf3145d596e2dac5554385940364a4e4082824efef17a471e5e19 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_20-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cba818222b7d5cf58a18dc26717dc53b847c9fc5 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28105a2198294ffd61abf977e8dd5512ff129f3563ca13191bf2d422c0838252 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_21-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f27b49a2b54e03194c2f85b78f5b0b60eaa9c732 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a653b6b998ef2b33dc33e5a4971f6ac46fd7e5e30d9d74b7d5220608a51bd1d +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_21-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb4b6c67e5e865b1c46d3b1e075abd76cb58d72a --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f4453cd76227490eeeafc2b839a63d45da580cc6e116c6f3b900d7aa518280ec +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_22-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fc31d58781b9e333c1e14d2c50f55fc12e18ff2 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41698d3bc0031d6312ed4c82069ebea244acd73b8f77c65f75b73ba86d0f766f +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_22-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9e59ce8e7d3730fb229a4d23759533f54ba1332 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_22-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4d28ffcf213c81eabc4101c9a8365807eeb43e97159c1853ed6dd0f40eeb777 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_23-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1140ecd99afa44e80072c0e14cde8915df097cb9 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50689cb14c5ae1e0cbf005525bcf65255829df03f41b255106d9185c1e217272 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_23-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cf8d97c9b9dd8f1b0b2080c1dcc859387692ebf --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87b5dd0f81171c93136a0d29bfebca2cea460d17e1b88649c00cf17614fd3e07 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_24-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..230e91c3f154bfa23a516209c0d11014a193f8ab --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_24-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d7eaabb7221dcd28277178cc88d1d1ba3dd34136328d503e949cae79b35ac43 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_24-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..744679abba6450f76502eaadb8f06e5f802b1a6b --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9b6125486d0346502389b0132a628fa97c68fd07bd9e81ea6608e6359b405e2 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_25-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..668bef6b9029a3e20d420f56ae6afcd78fa54231 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f991dc10d8461f3454bd5d542a3a64e916eddb7913b32e34715a5e1e71bb7f0 +size 
113308931 diff --git a/4b284b42boscar/global_step80108/layer_25-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f832aceaed43f9b32c8c7d9fdbe0e4dc0b9cfce8 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6671f03dfca8416c24726108b3ace636ca60e08bf2835a9a55d74431f346cf8 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_26-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35d8030d7d621670bbe92289635196bd2cbf7f69 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:707f56867235e47832ad3d5313ff9d42147c80ab80b12c3c189efc35d318a91a +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_26-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_26-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..debfb44d30e85d8f4d13dc4a101c06a5b6cafe37 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_26-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83a13e1c823b023ca4e51d44d3734cc8c8dd972927b30ee80c1b8897c9b6d68c +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_27-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01488308b924824992db21b368ca90c05a02781f --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:423ec7040d4e3082ec64326067e2b0aea2c35af0dd4c35e3db92340e54624ce5 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_27-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9076917d960d0cab2722e84884ed15336c9a8ea --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c90395c8914283a2c2a88c866611ebdf54e90871d195ea412bc47dc20005a7f3 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_28-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_28-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55125db7c5b61774019ea10ad316993d592ea88a --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:978efedeea205385d00862ffa78fc0726e4683f2b701fcbff967bb4a7775a945 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_28-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..909b8b7b9a2094b780205cc383a27325557a7d2e --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:821813c01a08dc7b97b6fe91bbe413f51ac3b502087cb645d16a5b08793dab29 +size 113308931 diff --git 
a/4b284b42boscar/global_step80108/layer_29-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_29-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18a0284915573a1cc8cbd186aa1090e429b41953 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:345a0b93f2bf5156fda02581c5cd23074ba54388923061318af9399fc5329ade +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_29-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0203ec270e1d7f1b48f4a4739ae671b0ccd1750 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51d53ea8dcf0b49fecbaf1b9e3f893225752846cae147de904be976d4908f399 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_30-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82d2ecd13f24f2253e3f571cf26dbfc7d98994d3 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75fe5bcfc9170842a2058201719c0deb19e58b45d8dedf5f102f18eb35e88ad3 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_30-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d93c060365435b50a451815b607184fc5f60482e --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb35bb1fb89731975065dbe95f12df34c76003cf794e71b398bef59581267073 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_31-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..117461a8b3afd635056a9da887fd5b57b3075215 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:654d177ee7383de59ba600e62f740c6dbb184a0051fb6898ed64c5c6f857e5f9 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_31-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_31-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..810218a0b4f5e2707d5e3a3e47c56a711f1b9ad9 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67a1496fff0946aaf5e769e289ce64319f1f0b8ca4952662031e8efcc2805903 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_32-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c25e578e9dc6aebe96e06dc3eb4944108354d32c --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af638061107dece4d6ae9fcf3871fafa90520a3aff8c592bc848e541652bbe8d +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_32-model_01-model_states.pt 
b/4b284b42boscar/global_step80108/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..780fe3ca976f69123af9601b4defbce846692d98 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a3ebda96d8de7522b029f6e09f70e014e9629b468ba7f48aef3cbbb17073509 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_33-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_33-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f2d6ac9d2b2a1fb4a36c1bcf3b50ab8cd41c45a --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d2dfe1a44de96b52c3f98509c29d510bd907654325d008b6621ed1bdac5fa42 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_33-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2be9fb6a0e94211e9ae681b8b25f2078a36dcff6 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4e53ca3d9ffa84754e57421fdcb4fa8414fa925cf17409604f23461692b8e7 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_34-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19b45e35c8012f2019f92d08dc5af33044f4012f --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aef9fa0acda09b6821b390b0848020d28e3bae0ecbcf9c3d94178bf8afe827a2 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_34-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5001848f9bf85bc9e3e4cf9e44dcea8adf9425d --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:884bc265dfdac4a1c1c75412ccea9bc1312a2fdf3ba202b71d84389e10581b7c +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_35-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1fc852b324ed176e4437d7d0e5e11abccef0098 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae15e0d457ba349bd8a172849b69f19b91bd68ba3e88d7972469afabf4569979 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_35-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59df57134bc755d11d2abbb3427873855aac7532 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8167348594b762d7f29980ceb043726fcc60d48a2b6211d436e8e0a889f03c9f +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_36-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_36-model_00-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..5c0106f71a51a200386665b18b251be170198291 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c160d160afd055e842efe13570ed304852d01472a0b4b5507409915098112d4f +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_36-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5dd72457644617cfd02875431c8dc39da8ac6d8 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80875841d9a534cab7835c76146c84829e2664d3c3c16a2b8df8a89fd534969f +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_37-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9260e358623f1b212eb56a268a031ee4e4150c51 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ed75696fa088cec78baa6c735ac22f7a34366f9505ec5f4a969ec9fc7e39e9c +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_37-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c77dc223579110af1f2048ad64c5c556cafc121 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd10982db56ed55c62026b19f829bcbfd89c42041c6fd1339f8cd30ea0ee6b78 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_38-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c11f23966ffc5f1f3f4251c5c94a33571422bb81 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37af1776218644f64d85692055c9db98024194cb4cf7a4af48e4843831d1a0d8 +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_38-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b963651112061aa5c21ad2d4eeacaa9ca0b0b6e3 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c92e3d5adbfcd371b7236af7d0d892b17351293eec155e604737273e82cb2b9f +size 113308931 diff --git a/4b284b42boscar/global_step80108/layer_40-model_00-model_states.pt b/4b284b42boscar/global_step80108/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9f062bfd429e6fb315344908a862ae5d0bf37f9 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60261295a1fee7768551ead509e1c4c643f6d74066ca6931fbb64e12f462bcdf +size 13507 diff --git a/4b284b42boscar/global_step80108/layer_40-model_01-model_states.pt b/4b284b42boscar/global_step80108/layer_40-model_01-model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..195ffed72b28653b2c90c832f1177bbd6112faf1 --- /dev/null +++ b/4b284b42boscar/global_step80108/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d147ae313224a7b3f5c23728c52c196a607651338633cd1e0fef716fcc1a6db +size 13507 diff --git a/4b284b42boscar/global_step80108/mp_rank_00_model_states.pt b/4b284b42boscar/global_step80108/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1bf8b8652bb21126ef8fdcb53c3dc55f08465301 --- /dev/null +++ b/4b284b42boscar/global_step80108/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1be974ff308d927592ad8cece5c64570268e011a4237c78ceb9781df4bd3f99 +size 51635 diff --git a/4b284b42boscar/global_step80108/mp_rank_01_model_states.pt b/4b284b42boscar/global_step80108/mp_rank_01_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19cf941dfda1ca1ecdba002c08bac295e1d32c35 --- /dev/null +++ b/4b284b42boscar/global_step80108/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33a783aa064ec209d2a864cbcd538d184c27f427d48f114134a815e894134136 +size 51635 diff --git a/4b284b42boscar/transformers/config.json b/4b284b42boscar/transformers/config.json new file mode 100644 index 0000000000000000000000000000000000000000..df814c3a71ee49044410636b5eeb4f43611690ba --- /dev/null +++ b/4b284b42boscar/transformers/config.json @@ -0,0 +1 @@ +{"vocab_size": 50432, "n_positions": 2048, "n_embd": 3072, "n_layer": 36, "n_head": 24, "n_inner": 12288, "activation_function": "gelu", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "chunk_size_feed_forward": 0, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": ["GPT2LMHeadModel"], "finetuning_task": null, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": null, "problem_type": null, "_name_or_path": "", "transformers_version": "4.25.0.dev0", "n_ctx": 1024, "gradient_checkpointing": false, 
"model_type": "gpt2"} \ No newline at end of file diff --git a/4b284b42boscar/transformers/pytorch_model.bin b/4b284b42boscar/transformers/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..04ac29c595853b08a555fcb3e3c04f3dc98218cd --- /dev/null +++ b/4b284b42boscar/transformers/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ebb363d7f9ac7b5c0b8b8f6fa696c9083f725f30572d505b542f05ffb779a8b +size 8781203669 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fec81a9abdf89d9bc8abd5ea9d72059823e5da32 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:337805e7d33eb0b9ef7f91ad0b53016e978e597da3b1ac9c5fdbbd9bce719774 +size 3998442 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d0dea782e6918b359cb852b15e58a561965377fb --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8956e5fa362b221fd548e74792c554e8d6ca869449571ae037cebc7e70a36028 +size 4817355 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1f9136d1961cdb3504e72e8aba13da9820dd3ce0 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40926070e93157a677c79e557b30c170ad7d35fb97279b70be25b9f3217fc11a +size 5651813 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..397395b902dc0fb9a6b2aad879fe0352189a171a --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aea0974ed42d75b03d16097c1ff049db6b89656619e7dc4762a272438f24ec0 +size 6519117 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..524156c2ee95dccd21aa381f0973a6b57da9a5c0 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40499c72539b64fa7de985199cbbda348a1726c072794915ce73326f2d909226 +size 7404977 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl 
b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..05b18e7f668fb7efd85e98ca8d41fc6de03dd02d --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6978dfbc5c7ae337c68aa220d728655942bbcaa6a7774ec07f799c7b4136c24f +size 8274160 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3af91a65226843c27a6ec367cecf36b59cd72293 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fb5a7142e643238ec1eb33156051cd3a3b97b22512d4bfacdefc2c4269c2daa +size 7545259 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1e780693bf495ab001faa28f53e0bff2d061cf96 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0ae2ddb41adbec99054938e86fa508f71ced02756a08e1a42ff30ee1e76c18 +size 13004724 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7fba5cbfc83ebf5e986a80b65c08995bc5a91edf --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ce46d505823d2879e998e8e4663274f625143720b26f0db6a27a5205f951662 +size 18526370 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..23d7f2a3116cfac65bad72e0cdfa23c52883ff00 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e197e29330fb505aaa85a300f7797e9d15018057701c3110b3432cb0710accb5 +size 23984004 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..16957afcd8203b4892d488fbea58ec0509adfe23 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:223d4cb302aa6a25f5787c62a5664e8dcb38aa296efd3573d3458225c14236f4 +size 29353492 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl 
b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0df5f573bec5dfa174d92e1b3aacc22c29234870 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:657a675e0110248e250ab43aff6531e0051a2ced6b60ea87ea2f45b55e3c60b1 +size 34780633 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c4282f86ddfb215a6c55ecf1d676f82982986071 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff57fc26e7775ba8695e1682292a7104327be242f2fd607a8b860f88f0fbdf38 +size 4252706 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d699582887d4b04677db17d58c22b51bcd3dea4c --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:546017d1583f7dfeac283ac2fb3dbaa27007197e6cd4596778d124c6622062e7 +size 5006446 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..06a440dbf011ca862ab3bfd4dbed69c5549eb4dd --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a69d04f883d0691dea74562a335c3ae152ff2d31697afe6c19ae86ae7fca1f5 +size 6090766 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02458dd5ba8a2220926fc2909d2a7de56db87105 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:525617eb943d64ee5c1a69e028ed3035e4a5067d25f151a9c652be3783dee63a +size 7172815 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..924238ef8b20a008bec238d648a0e672167ddafb --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8e565de1ac84519b74966c5fb7d02e8708ca727ba4eee3b9b949231d796ba244 +size 8251331 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4cb9090e2ef46354904a96bb19d43b179fa32763 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4cea67e3b0d9b2ef9d45735780f3f99a355b203d59dacbd500aff35aef6c09d +size 9342424 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_0.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a72c67dc8ee8750c5ef1ea29f215ab04f9138d8 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d27ac9fc60f2ebc579313a44dabe5c80713ec2e2b3cc59f0eaa595e1580863d0 +size 2815254 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_1.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b1e93e7b88e0d17f995cef2b652a0a1365cbb2a --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ada65c331a25b2e7f00e02624b4f9538018e5bfe64eb5b1efbcd1fa7663b2c8 +size 4952495 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_2.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..fb9ce557a39fd7575f322ff08cb7ec20f321edad --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fdbb29e7fa27f19ff2cdc86c0474fbe1c92ecd7664cad56ce47ea7579c7bd05 +size 7211366 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_3.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cdedf262b82dbc9862d2c7e65d6715a52a6e8f67 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:743e12a2e4593e1750b50cfbd33d308e4ca1c0fb88c79878ef7c50c9f367fbfc +size 9483317 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_4.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..206ec095459904157b082312564c307a0bb743d7 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:9c9536372f37709d4330566a2c4263ef1e592ac7c9fca9c72872e456bf8b0c07 +size 11632380 diff --git a/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_5.jsonl b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9884c322e626f842d3746cee585273022ce2d708 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/examples.4b284b84boscar_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5adde5cb2800352b8f93d43ee0d98b9bce576a4366e87bd1329f435155958261 +size 13897214 diff --git a/4b284b84boscar/evaluation/generation/merged.csv b/4b284b84boscar/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..8d4ad2a8389c889b720cdeadb760a63ce6875229 --- /dev/null +++ b/4b284b84boscar/evaluation/generation/merged.csv @@ -0,0 +1 @@ +dataset,fewshots,prompt,metric,value diff --git a/4b284b84boscar/evaluation/generation/merged.json b/4b284b84boscar/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b --- /dev/null +++ b/4b284b84boscar/evaluation/generation/merged.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/4b284b84boscar/evaluation/rankeval/4b284b84boscar_0.json b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fdc302251146d63b213d76c36ac841954e0f10ef --- /dev/null +++ b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.323, + "acc_stderr": 0.01479492784334864 + }, + "anli_r2": { + "acc": 0.326, + "acc_stderr": 0.014830507204541031 + }, + "anli_r3": { + "acc": 0.3333333333333333, + "acc_stderr": 0.013613950010225596 + }, + "cb": { + "acc": 0.5, + "acc_stderr": 0.06741998624632421, + "f1": 0.3323383084577114 + }, + "copa": { + "acc": 0.73, + "acc_stderr": 0.0446196043338474 + }, + "hellaswag": { + "acc": 0.4069906393148775, + "acc_stderr": 0.0049026907650664255, + "acc_norm": 0.5218084047002589, + "acc_norm_stderr": 0.004985032806802434 + }, + "rte": { + "acc": 0.5234657039711191, + "acc_stderr": 0.03006330041190266 + }, + "winogrande": { + "acc": 0.5406471981057617, + "acc_stderr": 0.014005973823825136 + }, + "storycloze_2016": { + "acc": 0.6798503474078034, + "acc_stderr": 0.010788532546733108 + }, + "boolq": { + "acc": 0.5978593272171254, + "acc_stderr": 0.008575926383211254 + }, + "arc_easy": { + "acc": 0.5677609427609428, + "acc_stderr": 0.010165130379698743, + "acc_norm": 0.5109427609427609, + "acc_norm_stderr": 0.010257326131172867 + }, + "arc_challenge": { + "acc": 0.26535836177474403, + "acc_stderr": 0.012902554762313962, + "acc_norm": 0.2832764505119454, + "acc_norm_stderr": 0.013167478735134575 + }, + "sciq": { + "acc": 0.857, + "acc_stderr": 0.01107581480856704, + "acc_norm": 0.788, + "acc_norm_stderr": 0.01293148186493805 + }, + "piqa": { + "acc": 0.7257889009793254, + "acc_stderr": 0.010408618664933382, + "acc_norm": 0.7426550598476604, + "acc_norm_stderr": 0.01019992106479251 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of 
file diff --git a/4b284b84boscar/evaluation/rankeval/4b284b84boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..fdc302251146d63b213d76c36ac841954e0f10ef --- /dev/null +++ b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_0_lm-eval_global_step80108_2023-01-30-19-47-04_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.323, + "acc_stderr": 0.01479492784334864 + }, + "anli_r2": { + "acc": 0.326, + "acc_stderr": 0.014830507204541031 + }, + "anli_r3": { + "acc": 0.3333333333333333, + "acc_stderr": 0.013613950010225596 + }, + "cb": { + "acc": 0.5, + "acc_stderr": 0.06741998624632421, + "f1": 0.3323383084577114 + }, + "copa": { + "acc": 0.73, + "acc_stderr": 0.0446196043338474 + }, + "hellaswag": { + "acc": 0.4069906393148775, + "acc_stderr": 0.0049026907650664255, + "acc_norm": 0.5218084047002589, + "acc_norm_stderr": 0.004985032806802434 + }, + "rte": { + "acc": 0.5234657039711191, + "acc_stderr": 0.03006330041190266 + }, + "winogrande": { + "acc": 0.5406471981057617, + "acc_stderr": 0.014005973823825136 + }, + "storycloze_2016": { + "acc": 0.6798503474078034, + "acc_stderr": 0.010788532546733108 + }, + "boolq": { + "acc": 0.5978593272171254, + "acc_stderr": 0.008575926383211254 + }, + "arc_easy": { + "acc": 0.5677609427609428, + "acc_stderr": 0.010165130379698743, + "acc_norm": 0.5109427609427609, + "acc_norm_stderr": 0.010257326131172867 + }, + "arc_challenge": { + "acc": 0.26535836177474403, + "acc_stderr": 0.012902554762313962, + "acc_norm": 0.2832764505119454, + "acc_norm_stderr": 0.013167478735134575 + }, + "sciq": { + "acc": 0.857, + "acc_stderr": 0.01107581480856704, + "acc_norm": 0.788, + "acc_norm_stderr": 0.01293148186493805 + }, + "piqa": { + "acc": 0.7257889009793254, + "acc_stderr": 0.010408618664933382, + "acc_norm": 0.7426550598476604, + "acc_norm_stderr": 0.01019992106479251 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84boscar/evaluation/rankeval/4b284b84boscar_1.json b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e3d85a8afff68ecf175cb31bd94db2fb6c6abd38 --- /dev/null +++ b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.32, + "acc_stderr": 0.01475865230357488 + }, + "anli_r2": { + "acc": 0.329, + "acc_stderr": 0.014865395385928367 + }, + "anli_r3": { + "acc": 0.3433333333333333, + "acc_stderr": 0.01371263383046586 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.06703189227942398, + "f1": 0.28905472636815915 + }, + "copa": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316 + }, + "hellaswag": { + "acc": 0.40659231228838877, + "acc_stderr": 0.004901936511546117, + "acc_norm": 0.5297749452300339, + "acc_norm_stderr": 0.0049809261987989835 + }, + "rte": { + "acc": 0.5090252707581228, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.5509076558800315, + "acc_stderr": 0.013979459389140842 + }, + "storycloze_2016": { + "acc": 0.6702298236237306, + "acc_stderr": 0.010871682471395132 + }, + "boolq": { + "acc": 
0.6033639143730887, + "acc_stderr": 0.008556148582031995 + }, + "arc_easy": { + "acc": 0.6005892255892256, + "acc_stderr": 0.010050018228742127, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.010146568651002257 + }, + "arc_challenge": { + "acc": 0.26109215017064846, + "acc_stderr": 0.012835523909473841, + "acc_norm": 0.3046075085324232, + "acc_norm_stderr": 0.01344952210993249 + }, + "sciq": { + "acc": 0.907, + "acc_stderr": 0.009188875634996697, + "acc_norm": 0.898, + "acc_norm_stderr": 0.009575368801653892 + }, + "piqa": { + "acc": 0.7181719260065288, + "acc_stderr": 0.010496675231258164, + "acc_norm": 0.7372143634385201, + "acc_norm_stderr": 0.010269354068140779 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84boscar/evaluation/rankeval/4b284b84boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..e3d85a8afff68ecf175cb31bd94db2fb6c6abd38 --- /dev/null +++ b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_1_lm-eval_global_step80108_2023-01-30-19-47-04_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.32, + "acc_stderr": 0.01475865230357488 + }, + "anli_r2": { + "acc": 0.329, + "acc_stderr": 0.014865395385928367 + }, + "anli_r3": { + "acc": 0.3433333333333333, + "acc_stderr": 0.01371263383046586 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.06703189227942398, + "f1": 0.28905472636815915 + }, + "copa": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316 + }, + "hellaswag": { + "acc": 0.40659231228838877, + "acc_stderr": 0.004901936511546117, + "acc_norm": 0.5297749452300339, + "acc_norm_stderr": 0.0049809261987989835 + }, + "rte": { + "acc": 0.5090252707581228, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.5509076558800315, + "acc_stderr": 0.013979459389140842 + }, + "storycloze_2016": { + "acc": 0.6702298236237306, + "acc_stderr": 0.010871682471395132 + }, + "boolq": { + "acc": 0.6033639143730887, + "acc_stderr": 0.008556148582031995 + }, + "arc_easy": { + "acc": 0.6005892255892256, + "acc_stderr": 0.010050018228742127, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.010146568651002257 + }, + "arc_challenge": { + "acc": 0.26109215017064846, + "acc_stderr": 0.012835523909473841, + "acc_norm": 0.3046075085324232, + "acc_norm_stderr": 0.01344952210993249 + }, + "sciq": { + "acc": 0.907, + "acc_stderr": 0.009188875634996697, + "acc_norm": 0.898, + "acc_norm_stderr": 0.009575368801653892 + }, + "piqa": { + "acc": 0.7181719260065288, + "acc_stderr": 0.010496675231258164, + "acc_norm": 0.7372143634385201, + "acc_norm_stderr": 0.010269354068140779 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84boscar/evaluation/rankeval/4b284b84boscar_2.json b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..2b67965a30d132628ea14c01fa61a4925d35a9c7 --- /dev/null +++ b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.315, + "acc_stderr": 0.014696631960792508 + }, + "anli_r2": { + "acc": 0.343, + "acc_stderr": 0.015019206922356951 + }, + "anli_r3": { + "acc": 0.33166666666666667, + "acc_stderr": 0.013596836729485168 + }, + "cb": { + "acc": 0.5, + "acc_stderr": 0.06741998624632421, + "f1": 0.3416488477072939 + }, + "copa": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504 + }, + "hellaswag": { + "acc": 0.40659231228838877, + "acc_stderr": 0.004901936511546122, + "acc_norm": 0.5274845648277235, + "acc_norm_stderr": 0.004982237133409149 + }, + "rte": { + "acc": 0.4657039711191336, + "acc_stderr": 0.030025579819366426 + }, + "winogrande": { + "acc": 0.5556432517758485, + "acc_stderr": 0.013965196769083555 + }, + "storycloze_2016": { + "acc": 0.6734366648850882, + "acc_stderr": 0.010844543793668893 + }, + "boolq": { + "acc": 0.6061162079510704, + "acc_stderr": 0.00854583579261498 + }, + "arc_easy": { + "acc": 0.6005892255892256, + "acc_stderr": 0.010050018228742127, + "acc_norm": 0.5875420875420876, + "acc_norm_stderr": 0.010101305447864764 + }, + "arc_challenge": { + "acc": 0.26109215017064846, + "acc_stderr": 0.012835523909473841, + "acc_norm": 0.3054607508532423, + "acc_norm_stderr": 0.013460080478002505 + }, + "sciq": { + "acc": 0.915, + "acc_stderr": 0.008823426366942328, + "acc_norm": 0.911, + "acc_norm_stderr": 0.009008893392651532 + }, + "piqa": { + "acc": 0.721436343852013, + "acc_stderr": 0.010459397235965175, + "acc_norm": 0.7252448313384113, + "acc_norm_stderr": 0.010415033676676051 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84boscar/evaluation/rankeval/4b284b84boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..2b67965a30d132628ea14c01fa61a4925d35a9c7 --- /dev/null +++ b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_2_lm-eval_global_step80108_2023-01-30-19-47-04_2shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.315, + "acc_stderr": 0.014696631960792508 + }, + "anli_r2": { + "acc": 0.343, + "acc_stderr": 0.015019206922356951 + }, + "anli_r3": { + "acc": 0.33166666666666667, + "acc_stderr": 0.013596836729485168 + }, + "cb": { + "acc": 0.5, + "acc_stderr": 0.06741998624632421, + "f1": 0.3416488477072939 + }, + "copa": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504 + }, + "hellaswag": { + "acc": 0.40659231228838877, + "acc_stderr": 0.004901936511546122, + "acc_norm": 0.5274845648277235, + "acc_norm_stderr": 0.004982237133409149 + }, + "rte": { + "acc": 0.4657039711191336, + "acc_stderr": 0.030025579819366426 + }, + "winogrande": { + "acc": 0.5556432517758485, + "acc_stderr": 0.013965196769083555 + }, + "storycloze_2016": { + "acc": 0.6734366648850882, + "acc_stderr": 0.010844543793668893 + }, + "boolq": { + "acc": 0.6061162079510704, + "acc_stderr": 0.00854583579261498 + }, + "arc_easy": { + "acc": 0.6005892255892256, + "acc_stderr": 0.010050018228742127, + "acc_norm": 
0.5875420875420876, + "acc_norm_stderr": 0.010101305447864764 + }, + "arc_challenge": { + "acc": 0.26109215017064846, + "acc_stderr": 0.012835523909473841, + "acc_norm": 0.3054607508532423, + "acc_norm_stderr": 0.013460080478002505 + }, + "sciq": { + "acc": 0.915, + "acc_stderr": 0.008823426366942328, + "acc_norm": 0.911, + "acc_norm_stderr": 0.009008893392651532 + }, + "piqa": { + "acc": 0.721436343852013, + "acc_stderr": 0.010459397235965175, + "acc_norm": 0.7252448313384113, + "acc_norm_stderr": 0.010415033676676051 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84boscar/evaluation/rankeval/4b284b84boscar_3.json b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_3.json new file mode 100644 index 0000000000000000000000000000000000000000..572e0b71fafc008bce43d68987a8be3795379a08 --- /dev/null +++ b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.314, + "acc_stderr": 0.014683991951087976 + }, + "anli_r2": { + "acc": 0.348, + "acc_stderr": 0.01507060460376841 + }, + "anli_r3": { + "acc": 0.3466666666666667, + "acc_stderr": 0.013744022550571952 + }, + "cb": { + "acc": 0.4642857142857143, + "acc_stderr": 0.0672477765493766, + "f1": 0.4123643651945539 + }, + "copa": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504 + }, + "hellaswag": { + "acc": 0.4064927305317666, + "acc_stderr": 0.0049017474263317465, + "acc_norm": 0.530372435769767, + "acc_norm_stderr": 0.0049805669077904536 + }, + "rte": { + "acc": 0.48014440433212996, + "acc_stderr": 0.0300727231673172 + }, + "winogrande": { + "acc": 0.5501183898973955, + "acc_stderr": 0.013981711904049733 + }, + "storycloze_2016": { + "acc": 0.6745056119722074, + "acc_stderr": 0.010835369677013443 + }, + "boolq": { + "acc": 0.6030581039755352, + "acc_stderr": 0.00855727696467514 + }, + "arc_easy": { + "acc": 0.6127946127946128, + "acc_stderr": 0.00999531206589035, + "acc_norm": 0.5938552188552189, + "acc_norm_stderr": 0.010077409815364055 + }, + "arc_challenge": { + "acc": 0.26621160409556316, + "acc_stderr": 0.0129157747815232, + "acc_norm": 0.295221843003413, + "acc_norm_stderr": 0.013329750293382316 + }, + "sciq": { + "acc": 0.914, + "acc_stderr": 0.008870325962594766, + "acc_norm": 0.917, + "acc_norm_stderr": 0.008728527206074792 + }, + "piqa": { + "acc": 0.7252448313384113, + "acc_stderr": 0.010415033676676042, + "acc_norm": 0.735038084874864, + "acc_norm_stderr": 0.01029655799331605 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84boscar/evaluation/rankeval/4b284b84boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..572e0b71fafc008bce43d68987a8be3795379a08 --- /dev/null +++ b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_3_lm-eval_global_step80108_2023-01-30-19-47-04_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.314, + "acc_stderr": 
0.014683991951087976 + }, + "anli_r2": { + "acc": 0.348, + "acc_stderr": 0.01507060460376841 + }, + "anli_r3": { + "acc": 0.3466666666666667, + "acc_stderr": 0.013744022550571952 + }, + "cb": { + "acc": 0.4642857142857143, + "acc_stderr": 0.0672477765493766, + "f1": 0.4123643651945539 + }, + "copa": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504 + }, + "hellaswag": { + "acc": 0.4064927305317666, + "acc_stderr": 0.0049017474263317465, + "acc_norm": 0.530372435769767, + "acc_norm_stderr": 0.0049805669077904536 + }, + "rte": { + "acc": 0.48014440433212996, + "acc_stderr": 0.0300727231673172 + }, + "winogrande": { + "acc": 0.5501183898973955, + "acc_stderr": 0.013981711904049733 + }, + "storycloze_2016": { + "acc": 0.6745056119722074, + "acc_stderr": 0.010835369677013443 + }, + "boolq": { + "acc": 0.6030581039755352, + "acc_stderr": 0.00855727696467514 + }, + "arc_easy": { + "acc": 0.6127946127946128, + "acc_stderr": 0.00999531206589035, + "acc_norm": 0.5938552188552189, + "acc_norm_stderr": 0.010077409815364055 + }, + "arc_challenge": { + "acc": 0.26621160409556316, + "acc_stderr": 0.0129157747815232, + "acc_norm": 0.295221843003413, + "acc_norm_stderr": 0.013329750293382316 + }, + "sciq": { + "acc": 0.914, + "acc_stderr": 0.008870325962594766, + "acc_norm": 0.917, + "acc_norm_stderr": 0.008728527206074792 + }, + "piqa": { + "acc": 0.7252448313384113, + "acc_stderr": 0.010415033676676042, + "acc_norm": 0.735038084874864, + "acc_norm_stderr": 0.01029655799331605 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84boscar/evaluation/rankeval/4b284b84boscar_4.json b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b4f5d87d305a6385e5e20f502220eda5084726ce --- /dev/null +++ b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.324, + "acc_stderr": 0.014806864733738859 + }, + "anli_r2": { + "acc": 0.346, + "acc_stderr": 0.01505026612756444 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.013680495725767792 + }, + "cb": { + "acc": 0.4642857142857143, + "acc_stderr": 0.06724777654937658, + "f1": 0.3220736570490265 + }, + "copa": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607 + }, + "hellaswag": { + "acc": 0.4078868751244772, + "acc_stderr": 0.004904375631128856, + "acc_norm": 0.5286795459071898, + "acc_norm_stderr": 0.004981566295189449 + }, + "rte": { + "acc": 0.47653429602888087, + "acc_stderr": 0.03006330041190266 + }, + "winogrande": { + "acc": 0.5516969218626677, + "acc_stderr": 0.013977171307126342 + }, + "storycloze_2016": { + "acc": 0.6814537680384821, + "acc_stderr": 0.010774165229761342 + }, + "boolq": { + "acc": 0.6051987767584098, + "acc_stderr": 0.008549304887647416 + }, + "arc_easy": { + "acc": 0.6060606060606061, + "acc_stderr": 0.01002630535598182, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.010063960494989163 + }, + "arc_challenge": { + "acc": 0.2781569965870307, + "acc_stderr": 0.013094469919538807, + "acc_norm": 0.3122866894197952, + "acc_norm_stderr": 0.013542598541688065 + }, + "sciq": { + "acc": 0.92, + "acc_stderr": 0.008583336977753656, + "acc_norm": 0.927, + "acc_norm_stderr": 0.008230354715244059 + }, + "piqa": { + "acc": 0.7279651795429815, + "acc_stderr": 
0.010382763786247381, + "acc_norm": 0.7317736670293797, + "acc_norm_stderr": 0.010336761992404485 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84boscar/evaluation/rankeval/4b284b84boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..b4f5d87d305a6385e5e20f502220eda5084726ce --- /dev/null +++ b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_4_lm-eval_global_step80108_2023-01-30-19-47-04_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.324, + "acc_stderr": 0.014806864733738859 + }, + "anli_r2": { + "acc": 0.346, + "acc_stderr": 0.01505026612756444 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.013680495725767792 + }, + "cb": { + "acc": 0.4642857142857143, + "acc_stderr": 0.06724777654937658, + "f1": 0.3220736570490265 + }, + "copa": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607 + }, + "hellaswag": { + "acc": 0.4078868751244772, + "acc_stderr": 0.004904375631128856, + "acc_norm": 0.5286795459071898, + "acc_norm_stderr": 0.004981566295189449 + }, + "rte": { + "acc": 0.47653429602888087, + "acc_stderr": 0.03006330041190266 + }, + "winogrande": { + "acc": 0.5516969218626677, + "acc_stderr": 0.013977171307126342 + }, + "storycloze_2016": { + "acc": 0.6814537680384821, + "acc_stderr": 0.010774165229761342 + }, + "boolq": { + "acc": 0.6051987767584098, + "acc_stderr": 0.008549304887647416 + }, + "arc_easy": { + "acc": 0.6060606060606061, + "acc_stderr": 0.01002630535598182, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.010063960494989163 + }, + "arc_challenge": { + "acc": 0.2781569965870307, + "acc_stderr": 0.013094469919538807, + "acc_norm": 0.3122866894197952, + "acc_norm_stderr": 0.013542598541688065 + }, + "sciq": { + "acc": 0.92, + "acc_stderr": 0.008583336977753656, + "acc_norm": 0.927, + "acc_norm_stderr": 0.008230354715244059 + }, + "piqa": { + "acc": 0.7279651795429815, + "acc_stderr": 0.010382763786247381, + "acc_norm": 0.7317736670293797, + "acc_norm_stderr": 0.010336761992404485 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84boscar/evaluation/rankeval/4b284b84boscar_5.json b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d04b1e5fafe1b278d1bed505c6146982e953dc77 --- /dev/null +++ b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_5.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.348, + "acc_stderr": 0.015070604603768408 + }, + "anli_r2": { + "acc": 0.355, + "acc_stderr": 0.01513949154378053 + }, + "anli_r3": { + "acc": 0.3591666666666667, + "acc_stderr": 0.013855141559780354 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.06703189227942398, + "f1": 0.3114219114219114 + }, + "copa": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316 + }, + "hellaswag": { + "acc": 0.4095797649870544, + "acc_stderr": 
0.00490751210312835, + "acc_norm": 0.5337582154949213, + "acc_norm_stderr": 0.004978395540514379 + }, + "rte": { + "acc": 0.49458483754512633, + "acc_stderr": 0.030094698123239966 + }, + "winogrande": { + "acc": 0.5603788476716653, + "acc_stderr": 0.01394964977601569 + }, + "storycloze_2016": { + "acc": 0.6819882415820417, + "acc_stderr": 0.010769343495248544 + }, + "boolq": { + "acc": 0.6162079510703364, + "acc_stderr": 0.008505584729104966 + }, + "arc_easy": { + "acc": 0.6077441077441077, + "acc_stderr": 0.010018744689650043, + "acc_norm": 0.6085858585858586, + "acc_norm_stderr": 0.010014917532627817 + }, + "arc_challenge": { + "acc": 0.27986348122866894, + "acc_stderr": 0.013119040897725922, + "acc_norm": 0.30716723549488056, + "acc_norm_stderr": 0.013481034054980943 + }, + "sciq": { + "acc": 0.915, + "acc_stderr": 0.008823426366942324, + "acc_norm": 0.918, + "acc_norm_stderr": 0.00868051561552372 + }, + "piqa": { + "acc": 0.7236126224156693, + "acc_stderr": 0.010434162388275615, + "acc_norm": 0.7328618063112078, + "acc_norm_stderr": 0.010323440492612423 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84boscar/evaluation/rankeval/4b284b84boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..d04b1e5fafe1b278d1bed505c6146982e953dc77 --- /dev/null +++ b/4b284b84boscar/evaluation/rankeval/4b284b84boscar_5_lm-eval_global_step80108_2023-01-30-19-47-04_5shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.348, + "acc_stderr": 0.015070604603768408 + }, + "anli_r2": { + "acc": 0.355, + "acc_stderr": 0.01513949154378053 + }, + "anli_r3": { + "acc": 0.3591666666666667, + "acc_stderr": 0.013855141559780354 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.06703189227942398, + "f1": 0.3114219114219114 + }, + "copa": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316 + }, + "hellaswag": { + "acc": 0.4095797649870544, + "acc_stderr": 0.00490751210312835, + "acc_norm": 0.5337582154949213, + "acc_norm_stderr": 0.004978395540514379 + }, + "rte": { + "acc": 0.49458483754512633, + "acc_stderr": 0.030094698123239966 + }, + "winogrande": { + "acc": 0.5603788476716653, + "acc_stderr": 0.01394964977601569 + }, + "storycloze_2016": { + "acc": 0.6819882415820417, + "acc_stderr": 0.010769343495248544 + }, + "boolq": { + "acc": 0.6162079510703364, + "acc_stderr": 0.008505584729104966 + }, + "arc_easy": { + "acc": 0.6077441077441077, + "acc_stderr": 0.010018744689650043, + "acc_norm": 0.6085858585858586, + "acc_norm_stderr": 0.010014917532627817 + }, + "arc_challenge": { + "acc": 0.27986348122866894, + "acc_stderr": 0.013119040897725922, + "acc_norm": 0.30716723549488056, + "acc_norm_stderr": 0.013481034054980943 + }, + "sciq": { + "acc": 0.915, + "acc_stderr": 0.008823426366942324, + "acc_norm": 0.918, + "acc_norm_stderr": 0.00868051561552372 + }, + "piqa": { + "acc": 0.7236126224156693, + "acc_stderr": 0.010434162388275615, + "acc_norm": 0.7328618063112078, + "acc_norm_stderr": 0.010323440492612423 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 
0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bc2811c479e682b356b4f6d56aff7f69fca6f40 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5128a33b65de7ff339d8cf9c3ebfde6c69eef50b3ad7607be58b37b02b5a9a6 +size 199058647 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d71b1cc61496b5a79f2260fb6895625539b6a19 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6587d1ca5f41cbc89901b83bf247e652c759aab5bdec12718747275a570e62dd +size 199058647 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0139ec7c1a6ba3c719ba7f8b883df4459491c509 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e672f8ad7d86c3384a5c6b9f3ce4770c021f37eff4106ae1459b9cd77a6eef85 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71daacf6fd195b36bb6e26e4f62d47600c09eecd --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d05a595a6e8d60e9fd7b5646918a39ff04156c8417cfadedc1ed8294ec4a47c1 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a779873d12d6c5ef67f058e442094f080d6937d --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d712055fe96ecfe1a56f1257a008c35b3f8d2e1b0aa5bebbff1ca99e21f04630 +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c20dda98f3c27c6e98a0f757e7135b6ddd6fac4 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69671f59921c3a6cf2a92a4deac6a650cca31ea920929c0a21b93a8234b958ed +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f82bf9d799b907e937178c13d9d9bd21e91754a1 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1655a5acc482ad3e1da11cd98da6762265e6b1fdc93cffbadf436fedbae61c24 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5fcdb9598dd73a74c1be85c30647153bf7e4215 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41cc4f9e2484d2c43d76b5b1697d24d75a6960db658057c38bee6cd74fff6854 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0f0078e9593fc2bf4f9e444f399f26e859ce6be --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a10e8ba759c4af3a361e0df1c5bce962ddee87f6c028248f391ef131c06d40d +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2e373e799f4d0a080d5b798d5f8cfeb56ee234e --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:752b1a479b84ea6e3230bb6b23d6daaebe491885e259c0a80efa790d19e6de10 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c08d18e3d90395731b749c4b51050cf04eb1acb2 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:509aeb133a948e95077be5e4bc5c8fe56a3ea273c60064f2867ea3d0e585de47 +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f8b025da60b5777deacfb8a6b31e4b561b2720e --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46cb48281fa022c6ebf94fdf0c3ef8653e5f394553eda86691f1442632acb0a1 +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cd934713708b1145398b4ba4aa39adefdfa247c --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a7b57b0d3072bd813675f541812653db45295058fb79bd20bca6c0c574d8ac9 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..333c6af8fb582efb07d574312bcf7929f2e04ad6 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17f0c6263fdc058e77eeeb9300286a12ad0993f1d55fb302e5c43e99f76f0eda +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6dd13c62e58247e6e8848ca3b43d103a1793d2e5 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fadf8d6f93a41a43913d596295add47593d6b6441ec04ab6e46a3ea4efeeb6a2 +size 199058669 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73190577fb68c917e77a0f456d8bf72b354ad8db --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:130679f9b78b09fc27b69ea3c1cbb5a96d5dc221792b73b3212eb44b0cdf4f3b +size 199058669 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..47761067351150d30d4208935262458185457db8 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da645b8ba08489c904033798044eebbb3f8686b5e28901158754fe222f1771c3 +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3015d3943bf561c6b6173b38730392101b9bbfa4 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac0f4152cf15571a18f3de1155b78797a3eaa95c2466224eb95d1d27677e397e +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9896569dcc6f02ece70ab3a3a5879fb557b72d20 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ff8daba775fcebca6a2064fc0785a0db922d888cc0ec435af863d1e37b07c10 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af42466372abd3c89ae0ba7c910eed9a1869ec93 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:240490923e4304d8ac0315c87a6e6851e0fc6fd3f7d80349cc1d6c555a006613 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0547e6dd5385aaf15e6f2010d82baf6a2df1a64b --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8fb696e3c44c12c3e3fef64334abaf4fa080eebeada47c8c41be952e504ae12 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5efa438e294d2140a217591be107cf7d2392c874 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c229cbd9579a123e1d3f9326241ec75cc84dd3182abf1819698a7aeac138fbf +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5753caa63fcab0cdd7d1069ac1578d7aef06fd79 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:776d101655f7c778ae5c32e315c39a3fca13f3cbd50c80536f634a550e7cc529 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..faef1a62908c489f5bfd82292efbfd03c56626f3 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e913882dcae93b8f02ef52e474f202a8bd91a891ecaccfb754b960426f578c7 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6de3b4977a092f746802beb94b94cf832f758544 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a1cf847551f70d6dfd6fce393d12f1d2c76b8aa544475e2877e0794aff04feb +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa0298f00c4d2266982212972ca208d20de66352 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:f92b5eb80fa5bfb4e6eaaced15f2fdedfc3f7931eef7997b1ce145e25ccaf848 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a271317d9820123f304701c5b1b0ed855bb7711a --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92b5b3ef3161c2262e9c9a80689616174f378f17ad04021c4660eaef1e35354d +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22621dc5f778f242cd0b623d4f38b83d4a7e7450 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f29cfa626221424d871bc1a77441fed192f883ba95ed2a6597e511658d1ac85 +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..52a37fd25dd10bf0a90d79126e2722b3dbdbf921 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d033c637a8ce9dd5adc44212fd0ab83544ffc496fb2848564ebe645f7dd6391e +size 199058605 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bc88a5847737d03787c9bca516391f0a3a3fda4 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6056253f2783a43bb9ba162c441c250e350c61a2bbbe3cdaec088d0d8b20968a +size 199058605 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f0ccbc894a955670c88f89abc1114541683cc02 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9583a226c705583cfee764db9184fc1084dae88c85b7a47cc187499dbf643b88 +size 199058669 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a56e126d668cf42fd8aed4bcb98f8ffb99101bc --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64baaa20ebad9b7117012d0eeffec5f98f51b8c001b67a0d0b46dfc045bf24a +size 199058669 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cef7627f46682bf32cf3aa79927c877eebb3ce54 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb3419c150f8fc776243bae259adbfaf165ad635a01581fc363caeb6ab821b6b +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b30333369f9d44e8ad662439f7ea2235bec363df --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de77506d81ad45b95666eab8fac3949a0f56643e0ee3f6e1d1aac4e6ba2e216a +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7c8458c3555bcef3b30e3255dcb38ebe4aef85f --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0261195bcffb4ddd62f686097a7cae10e992798be7f2bc827d2bb2e59b5d3b02 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ebaf070877baf369f3626ea89477a5217854f80 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fddc4b6475752e9554f9429c0d1b494bd322d6f5e3ec83882a4cd7702c4eea +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18bf35f99fb49cdbe39c1903a324ff4c791818cf --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92f3f6955bd4fe9a357ab269765c6610b80cb8e656fead6be8f7cd9e0ca91a5f +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff9a7276d17ff7a3e38dc39a9f3e35d54c926297 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416f2f7de0f2e8d0e1f852bdcfed2d68ec9805a23d4d75a93206efe6cb19b83c +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8086e89dd206e40336641e24201d0886bef9ae5 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b38d73f1409f78d9224d1cdab35151e561d018afdbeae5423c95222327079c16 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d84c8741e983c82ed5864c79142b18688740972 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174232bf7bea0488766711efb943a4f9d7225ce20c2e0107ee60a84a43a7697 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca1b3f82595d50e54337971c82f47297a8221d99 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4aa298103b3f64d596a80988f5de99f32b0752fb6f957c5d578923e36bcb330 +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ecae19230785446c85b3777f5de0196599a40a1 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54ac57b28220bc1637b25bc680387eb747b83c720997d50548e2300330acc610 +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89dfb0190dfa399cf764b6b256b248577bbd889f --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:298e2134848c0e2f2f08f9be120ef5697b5dfcf2497ec182012baf5d8470b5a6 +size 199058669 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed9f9f06899ac5c3c03e74475f8d090fb4ff637a --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:051f5ffa165c7205b2d0fa5fab9a153daa7f6b15b1f0924871d2b3cbaa4d982b +size 199058669 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bbe18b78fe9f4af4c29b445cec9a93350b53ae6 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90c8f42b48bf4aec12a07016b28f7c8260d1bf7ec6193370e4ca6d76735a1635 +size 199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aeec789d2ebc74c74fe3053692439364cb4bba8a --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcf3eb073b0ffee20049ba8128a1aad950b11af947b43df4cd39b6271b2a5b10 +size 199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea632331f929d5ca455fa364702d78e001473ebc --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:450232eec3942e96e6fbb2d3343ffc47f021f248562f64437926635d68113007 +size 199058669 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8e3f54b270cfa41a05b1e5c38e9db7ef5b60b76 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6bc1ff9ff7d5afa452d7852b6ffdbd63f03433e96197bcb969fb6592d32592a +size 199058669 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cbbfb81000e488483066359821bc5be5e5d65f0 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87c042b3ee6de877914c6450f263bd90c30b2679b3b2ff52acade46cd39a6f55 +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2b3a6c6971cb3df4728c251e20f2cf81840bc3e --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d61040e6d860dc386a69139495f3247319638157cab5ca018705066896c3dc97 +size 199058797 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5648925cd583813538bd3b5d849ad200c37b4967 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f85b4ec6ff499782e82232d7ccfdb92fd356d96276601ee2f885b0fa876a78 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7e783ec7a787d1a8f3f4f9c2beea65702ffce9f --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:e14a94efe25e7e0590377f33c05cd120dba6b388cb0f243143a5866d6f144444 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa2321f3c64da2cf9d6e2b355347a8e0ae77e3a3 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf6c3c63081c5bdceb456cd7a8a3768a40117687daa64d18d13775a266d12f95 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..079efa63219b3e2e9eccbfa6c02c38c6c75a3609 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec8fd7d5f5fe8764ea9fcb10b2474d0a70881e46c549b22c1dae53f91572d245 +size 199058733 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00f8f840778f1c4ba88f220dce5c8dd451355430 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562038aa6317affbde6fef08098c0b53976f3aa8aaa42cfda3ddcf274b0ebc0d +size 199058669 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d43a3fc3074d373c5b3e4b82bde660e9544b482 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfc74ee67202b554dec0bca15f4e04e9d6ea562eb45718fdd939eda7e2ad0cde +size 199058669 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..068ed147ce8f8e9fbeb8f5063cbd70d445899e84 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c55630500818718f87f44a2f3e4cdef4688f17efe253e86cef45fa76382b2b93 +size 199058925 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9c42c856847e81944069fd810612ca1259ca7db --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54e76c8c741453b15ea7729011d8211b6961d40ead0febd026cb816b7307abe7 +size 199058925 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1ba5a392b3fa2671607479e73d1816c1677b851 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee780bd5f62c8e394764e45401299cb52fb7e65218fcf7ef69a9be53137d9b15 +size 199058605 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6d5c603998a346925b801403ba68af28c26fd6b --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:339dc39839e6b434ab13a308296f6181c127d51c2194ed50ecd05f27465584f5 +size 199058605 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09e42f48f4d2edd45b02a3b4fcd6d3f95ffeb1e6 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6afac96f114f7c50dbdc4fe110ecec1358cf254178c62727f9c67d11e8861f9 +size 199058605 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c1c54c79511a512a4b30584dd3ff9c0836f415e --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a82ac6037d269f45533a2382d7ee12666bed386b5cd275ec7c62a1df7d5224a4 +size 199058605 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e961374bb75e5cd1fad22d3bb9df3b581ca23c5d --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13a9f161dd08df2bebcc86c53ed4219b61de3d5dac83555136bbb4109f688d0b +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff30e3d4cb65ba69effa96be92296aa7013c98bc --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ff3057368046989964d830e5913f5350816990f57205f549aebccdb0c16705a +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cefd42b210f414c909852204b4e037685ce5114 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:7b10706f3dcdbb248f3d9b5ad008be22c42b74e5bd4b39f7f160e53597341f50 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..048e482646939c8ccafe68f1e856e59a12dcec1f --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1089c850b53d28899fbddb2c4ab643c3c844b24db70da721f370728d802c71f9 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2df3b299c3bae6eea00e6b3a636d9c03ec1c429 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b19ae3a95de486f376e337498d5f46eca490c0ee052130800d48857ccf000c65 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0bcec815cac968e3b6d3ab39128e7ea7d8782c9a --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:366f51a729b6de445e6438afa3d5239bd7bfa8c80b7faa90f158646fbc5fc12c +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..adeec86b517bd922fc12d294c83711702a4ad93b --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f517de3b24895a10aa066bc499821c465c8e9f2006b3334fcdeffb4096650b40 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1bfaf974f2d8cd792dabd5ffeac4442da3cf5182 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c362e7cb7233ea931977ce4218371306ea738bea9abfad97ea4aca0d3c31ed98 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ecc9c7a2e2fa15812a24a6f5a3c86a0d9b21c3d --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdc172bc2f97f89d1c59a040910accac2ad11be08866387a10b4eff0bd6f5cb1 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..47c40c56a15b55cebb41736bd6e68115cc4f7d87 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39be3b5ceec56aea23c3dcd725951c8029e62d81856ba5988b82905a847fe658 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ade6ab23896c0d00cba6d2f3893b105daa224180 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b9642ae0e593fcf0fcf540e04e366e26b0996095a59491e6839bf71bd4804c +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..733066c03e8f0d6351f56de6d1b162581b60e096 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49337bae25b154e6a7cd5fd24ace5d1293de7005e13617e4c80f7c36773e273c +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a75949fd014face31143abf66c2bb95ece53c744 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9beb831f248f3311949d4c498b261de2ddf1369451cdef3352c9f6beb833a679 +size 199058978 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cf2135debda513dc15808159151d4baab6c0600 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35c28d924979bb9b38a7609b8d694bd3d5a9d20fea48a1ec9d0f94df62824236 +size 199058978 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f846527748740f15c77a10f2008308d2b042ba99 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53bd325bf7549b525e2691b62f6076c134c273eb65db800c549a474fe2e1f264 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd812f45e5cebddf27be5589a9864c81f97c0f02 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:54e06be9537b40081556da118332a38fe480ed5f3cc0c105a16959a1426ca06a +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3eb0761c4a9592d151937f7d64d8a7ee8a5f374 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d2fc5c49a1700f292927755413a3bd0b9620e3dd371ad1cf702d54564c7da5b +size 199058647 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..deecfd1505dcdd7a38f3e3c06da8f2f872581d31 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c8dcf76a5e0937c2e89e616f87f8a15b1008895d52b0b62c249eecb7d028af7 +size 199058647 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bd6d783afca017ae2fa7258d84840bc54a97310 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a778d4039312974ca2ababfaef0e1775ec566e84cbc3d40c354beb8fcf1292b4 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..967e68a9b046098dfa8a9a633b403a25aaef69c2 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:352ab65de222a9abad59a171f55a24f8958fe070cdc3acdd4781709280149b46 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b88a9192ac3d550763e75cb0e3aa10a14f19ad07 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66617e66570951a67e23b5a9aadfcfab817810fc9604b1338194bc2c3bf42a0f +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3adecba7d4d44a07da6462c55587d67f220240a3 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af8520e676022f2fa0f358dbeb697d613069872090e149efc41bd18c0cd82fbc +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..76c5677ae174e5d536c65a090ab1b6c96cc29e68 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:740bb52ca65485d7ceb642ddd16558864547509e2c636af8614d0052bb466c0b +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c30e1b74febab44dbd76b4a0e9aecb2aff8165c4 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2519eb84a1f7cfacb63a404689149650b7d0008676da61182beeb19ec78a353c +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42a9585ef31d4e19df0c69c1e2f56137c651f825 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c867e13a526f275f2ea41b98ea2e60199712a35c0c70946d626193977abdcde +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cede43ed5eab0903605ca5cd993d2f6f839f8260 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:006480b531f6bfc8960028cb11bfe8269424bb07fe49d679df06779fd9583eb1 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2d3198705b33dfbf2ae058706cd4e996b80de29 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c102fc09fd72c1d229ac625a256588c0e3212e0004dfc7051b5d03a5c432a755 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a315627366a3015c8d26f774b16aabcb5a4f81ef --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07f6745265f028752587cd7ae37621b1c38bbcbfba8e122c211bfaeb8fe21680 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e1de98a85c50285fad6f9b822aac458d9519123 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f03d1aade107e524ae38b5b1e7364a568d69ee6334a47ccf2269f4b8e3e1905 +size 
199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..373a136d26fad165bb74093b718d5c184ae8d32f --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ccc813ce055b9b44d7e34c203f98cf5f38103bafbc5796cfc691a44d1bc450e +size 199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89965ae6ec4fdf919ae2961332e06dba778b7f79 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b317c0d20b38b957e7a461e974c46b57586f3b60bca0a5e8957836849c318147 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35a62d953f9361a00c4f228d0d0493925daa9c2e --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb2c640ceb63a4379c610bfd2765120ef6588379459e93bb9f5ad9e343e45530 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b0bd31bd636baf3b4d4ef00dfe0d6b9fee84ac7 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bec1b170c3ed18a277ed58320bc9d120ef08625d27bdbe970e56157d22b6951e +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37bc314131559b54e1d4a9fea4e90b95b7b1ed18 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:913cf56053ee742b71038b2df21330ad04dd043b2fc579b2c8042432bb6e14f9 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fd87bf5060c8e158266cee6498681f78b94cca3 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11b37d0e0defc80a1d074b1b961494e679bb1c4053a8359b99b4139c1a8940f8 +size 199058594 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..251aaf063eabd629cf25ca1ea5396e5d967782e9 --- /dev/null +++ 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6577191b3b3440b9833c606e4531a086ee7039733a59bb68a7851c9c9c8d9e3 +size 199058594 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c991e334c1d9f2308ea6ab56fe1c0cfa352a096 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cbf11a17e4080c98cb8843a78fc4568a9545c606e8b8b3158307b732848a098 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cdcf862230512c256dbe91e9cf58a3698898165 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e8975e134d862f5a8151f5da9773be04ee5a96b29b6b2719ae2c79e58070a36 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bf08d146b3607d46a8ed672b790793804af2d2b --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b851f44643c64182bf2111717a2c63fae3e9f2ecbb8aac0502100868e7dfe85 +size 199058711 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67693f02044c1b1a6a69cd928f30a6b4498cc210 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc3a5acb2ff1dc2e8e97874275e528985b7abfb4000beed0d726420d63217f2c +size 199058711 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89e39d4bcddc4a93a4c015f880cb008105d9308a --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a750c019cb46bf932cd02a68455ef06635826ab4defa1df2935a6b07fad85273 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f74a0b88fc6d719f1190981b696dd2b67dd55fa --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e56f20743c2e4efee9d4179b15855fa180fb3ed43233b2529024feed48dae3da +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25d6cb37fd401212af01eca4f130c6b7d813ce71 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dee6b3ffba7ab59e842c9162892426305e4c710e0be3a6885cd3691b35b3819b +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02887d4378ad780a55ebb06e88d6e68b021b1b31 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f86d2058497598707fa94f63b6f01d6318199470eaddbdf6596a6d9cbd009f6 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3235c3b0af3dd4809bbf1f5dd6bfe6fd74c2b9ff --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:655385ce66fdc1c13c2922edac1c52f6f3eba18e7c93eb32857175210308695c +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21945e80cabf65f368b4b25a7a2f469ec7f76a1c --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64060a9303f29217d0a66878608ff919a63af65253593916a4b245dc0bc2e423 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f0d2757319a968dbea4d5910dbfe3b762aba541 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0d856e4f3181bd2e87f7d11344e2e11110a02abc28aabdceedbfbe7150813dd +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f4cef0ecec2bee297c918cc561e93b0c2afec5d --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a33a9ee5c41f4dbe8a79a7da066f58febe4e86f55d6dc1fdf73815b5c033deaf +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0910d5cf26ff30bc4559e1d56cfb8589cb903d98 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:8145ab12dfb7a578f36a591755a3a1475803d3ef8df322c47ccc14fecc9f0535 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c87131b0a3846331deca34e57812e8ee392cb4c1 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bb6db44d9671d65eecfc21d15600cac61474c6232a86a602dc79c31c643d458 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e0bc881a94ec0a9ade3a35b01e7e762d95244da --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:539c4387b36e83b6d4defa9a5ff526c54f91498c78296db1f9867150eee0e660 +size 199058594 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c12c11bd6ef671f8a304e086c5121cb656f430f0 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7cc4a69edb93241b40806aea086d5ee6d0e486588faa8c92458cf9fc0426270 +size 199058594 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff4cc9ff94e94246a669fd8108da3e0871e63d1b --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e281b788244021e65169325438035ae29f4b0eb52b252159af0008781777f2e +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c8b05e9856530b2f685a9109c60ffef04fafffa --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb079363dd7c30271638179e83c3a4ed419e88e18c782635764a88d9a60ea72d +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c196d89d097d4d236eef537dc5b1af287c6c0dd --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f1f34c20171157ad5fe52238b907d49c87ad1c631015251e68c424f4b366b7d +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..5bf3202eaaa3bcc547684ec9cbb60b9408f82b7b --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ac01e84b1f03a79004e36b8c6a4b0a94d498b214b16fd328b194c8c2dbfad26 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e712789c21131f4a047dc8f3b0a4294df9f7e03 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d06ed7ff260a2c5d1968ef31f4e04c3c0a71a2713bef4b49466b3deca8ba7fc +size 199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a93e79b8749b72815d64e25316dea02908a9975 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:134dfa0c1b218ca6c83aa11e2f6f86d824c55e6183c8c2c5c2c70425930a6307 +size 199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e367310632d88a9673edbb9403489c51877ee5b --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e39dd30b286114cd3202006a84c1001e8a4ca87fa4a7364e176d8d28ddf8e802 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f015c85602f99ae587c0d78a69d1c44d21d0ca2 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c13114a4cd6513a9d7696d26836f036dd34e78c01587a936305b6075369b9a6d +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c61bd94a995cb95694e93fa60bc164571ed38f6 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09ad123552fcbe364cc675492aa69ef2b1d40878d5f0814cc195bc302f42bbba +size 199058647 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..569ee755f3f6af3c42d853d3f279c74a59f60ccf --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1da3eb1815d8345fdf020c351f9ce1adc028f47e9a61bc1ffb34fb9c0eb4f315 +size 
199058647 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8b0ab49f138b3c27cce02685bea53f2ea4399da --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b8fa5e02df0d95e5c0aac2f8463ac20711204fa8947d9456185e783268561ba +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d01f34be06aa4d963763fde734a47e6d4e99feb --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed18e2024e44575c8c060e57d5b3a553c0c911fa2180d554802b2b36cf765c99 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d62a8431ef69434db2327fb8831df2987ee16b90 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52a9e49ea62f5fbe8dcfd7cd3f221e62f5a9f5ac62fa04290853e660181b7894 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f6c94793c7c5faf0b7c9746ca1207d43a9df5b3 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08ea5f6016b2a278ea8f72bcb4513bc1c350bf3f0e4883922550900d72ca39d8 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2974be95d7d3db778c27283f7c0852be8c4f4a6 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d1d675156cbf09df144459ca3d240c90b105fb4c921843d5ceb269f822900aa +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed779c8cdcc4b6254962fc6463f403e30dbfed69 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbb2d20f88b6a6ced61b41f0ac091669719452faec73c0ce19bd65975a67075d +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d815d6c5eb478368f0e43a39706b7dc96b7f7771 --- /dev/null +++ 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1cc3a86bc103b11181a5ad9e445ee71f337e1f9a97545bf2244c301d82ec256 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d311408064695fc5463b606541ffa54a8c685b26 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c1ca53c541c54e6babe2a945a68aa8c43d0800f2e6eead6d71d412889ee740 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fb6682301c237bce19a6fcbec1dd9346fbc70ef --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:419577bb179d2c1c6f8e8b870dd52857b7062f218926e1e7d25b658a6f96d295 +size 199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..076f4ea23afb8e111be3eee6993ef343ea17ce32 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f28c54dac06e3e8eaef834e0a1c53e73658f5e834e7ecab4b4f1fdc3030f831b +size 199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b1ca2c27694c68205ad3c19120351f592956791 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5735e87a1c042f217fc4045f2cdde05d58de4c59d7bb8949cc1f573ee5d901d +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..84055d490afc0d6c9ef25963e7500306ecd831d7 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34120d40bf06423483ee779dfee16d7cf4c3fcc33fb14741ead13ca0754a4bca +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1318182ad370d40d1d1867daa35dfb9fdde5d89 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f736db8024b7cbfbec4ed294a15418dcf25517563ede2a00fee5c1021be6665 +size 199058658 diff --git 
a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2afc0123c2c3ed13f58f19498c850b82de4bb839 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aba2657551d8e706680edf92aa2a773744aa89fd4913e93445a453ae091c642 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99a4ffaf949605536fcc986265b327a703f48f4f --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e98ff1e1497b20fdb22a971d0f618348f2d1afe1c0de26f68f821d676ea9cc66 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..134f391f619a24e008806d2b7c0455fdfcc8f6a3 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e7cb49079c793beafa30568c062d3a2d9da33f1e5f64d6f8064ea08448e7ca1 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..222abf90bba5d5ea62723e7dea06cdef95cfa5d1 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdba1ad48d4269f035ab3d81c7d53fe191c924629003d6f86f9310e2759f3966 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9db6ae36cdf46f6e09b02909c8ddbec5c245b613 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f49afea2a2c94fa56a3b31d5e658c8582839d9cd3ad58843fac524199027596 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c88f5dd07694eec6d6b2ac6391fd06b00b793ea8 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dde84d8c6a295d0705261d4088fe72f14c7818580e899effc3ba6d530bc7977 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..611736bee39a3da24c21a1eaba6173397052ebde --- /dev/null +++ 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:486599a7277419f235e3366b29a098e9185bba8d62075923efb1a7f78aac688e +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d4f2fb429ae87dc2491eb2309da046f0eb5cc08 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:821361fe82f6e944e98fd36e3ffc92a1bf20d6a2104f80c0862342c8ee982955 +size 199058775 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09219efb8add74b661f19364c1963552486bcfee --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa325c44136e16b3855ec85928345950b07d75376dd85ebb2fa0a5469e5a3a85 +size 199058775 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..415ad9a390d83ad9f40b818b31963f5bddaa8ea9 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d66138b5872da330afa0aebe7361fd2509ab9107accb1576e57be0cbcc3136d7 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6899a8f98caf701a1e9b3ebed126643c06045f7 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e9d26dbfb2642eb69a4e7020c2eb46fc3a012cbcb11200185cb838249fd5a4e +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..164c8e49151b9bee44d9d5a31958642d3bb3331c --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbc53837f8b60b91eadb01ea28186334763867d5a3839e3416b36e6248ebb8bd +size 199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a5d6efda0adda022013c43df91c0345847e773d --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ad3508c7f4626fd37486505af3711381046bd773eed09e4d319b80530fde2fd +size 199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d76f5572939890d2792bb1a5ea11b39b537706a0 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e2aee32a36c006f78ff42c010c21e78d719adc83f7cc3e25cf792620c573e37 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bffa930524b0caba08691e793f49b2fb2b6420cf --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f31008a04a534fc14100bca5cc3049541c5bf7fd93b0b515b1c2b1d93ab19a76 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ed0c4c8aae11b28f872884cd5a8971ecdc97182 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74f012c7708571574e93ed817f819d7047c58441a3cf27e504eac9577ea2e661 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d38953380dc00638e1bc06f5d34db6f77a83200 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bb7e7fcb7fa354f547aa8b01768794ec99ae16322e3fcd0d49a50732a74aef7 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..952bcc64293f016c91597bab96064f345e96dd26 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c188a3e05646d1f645c68696ec67bedfd55e5ca1bc723580c3dfb4381d6f9e00 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4af45ca447a869140e9a6e167a4b1de1ded34d6e --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5003dbcfa3477a65c2e446ce4f99fd1b41cea20f8283f091f29d563819d6785 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0041a9516fda0b60c7024807925c581c2ece8b7 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:685eda3c3561bb18c4f0f778af1a5d54d2c85c07035a6628bb5b7cb5496321d2 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0a43d47a74fefad6c66c2391944fedbd27c78e5 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce681cf1e9b690b437e5454dd6c9b563c068381301ab98cec15b25dcfb79062c +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..492444f1e9f986e22b9be8b3103a98b08957f157 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa1aa6c95c448d1a291345a558db3caeb9869971dd2a29542c31c531fd963161 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d878983ab06084487ebc298b60156aef934007b --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:122a467174a09a79359325cba67dc8aad45093fee7e7eaedfe84efaa0c23a5bc +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d43ef24a9c6bb1efa577794defe36fb5e3fd0769 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6677a56255f46fea01efe5e102202f11280146a9d94815daa52b80b12204896e +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa77a8cfe785cedc886698b86c36782c073addbf --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea7641a1397f0b6b0d6b6d5f1fb4afd69640292127d07844f06e7195fa8e6f50 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b04ffc2a70da72375eb149672b132438bc9db25b --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c4a9eed9c5337cae64b31a82aa1d44c39bbabed0c61804091939bd640707eb8 +size 199058914 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..263406b26a43135ed84c3f575326b4736487b321 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:990d77cd62aca83095c1617822f817b42d4b616450c580169d7b5d404f241a67 +size 199058914 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0e512f1e3c4aeabe506318b85e79c308bdbd848 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d85734e13da57d58e3e42b5f12716d1fb202ab44120e0ccaea65f33a6a046ab +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c667df93963765b4c0c5763bf2cfe343c24fa46 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd33dc679353ce8bad17e6f2771896ef950731e056f9aa3aa9bfb135bfa97875 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aba1385c7f95751a488e7f795a860318f0021fb1 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e74a495a8cdcd792edb8f1801c73f536062697622e5d93766ecb87958ac49ba +size 199058711 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49aaf372174b4af544ba5a3181c7c3c11ab36bc0 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2df107f66d88212b028d6df61155320ad34dfdc0ec4792e188964cf4fd1c059e +size 199058711 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb54a511631c859dcfcf1067528b794444ae8871 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:393097b23dbae88063920824eca246a2872d6756375cc666bfbb3a1224eb0eb2 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bf59188d3970e5502a889fa1307ec5f62ad351a --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:176dc6e36e1c0647bc93236aae945e27736b6317f5d1133374d1c2b361c2602c +size 
199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b21a2345308f56873bbbca196131cc763d0649a6 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48282375541d87396f9844fc750fbc68804801013c59f32c4c7e69860341dfbe +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c7bb067e9c91a02930b9012cb35ce21219de6bd --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ef0cb101267dfd419da708c1e9ed9c2cd997fc69561e91e3cf169f723b4ec53 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21895627b3ee0f4e2b52cad0e265d63770f14bdd --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c1034690fa64f66dd521d7123b3c7307855ae62ef2206369df7d10ae4ba806d +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7481d44e34ac66d41d77d3fbd02218b5b734ee42 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b9eb0cd790d15b864ad8d44adb50ff3b1367673c6f8548828d432c9d2132d32 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1abe32bce5bbba6f85c3fce36ae355f6e7ea7a58 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb03e1b837d86fe68325dc7220969d3bb891b6f568e8dfe186a1ec00b7da2fc +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0df2472de51ec77795ffecb49ac4fe56629430b7 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fd34e5f42af62358966b0a441b706817e75207dae3182ed6b2812222de1eb5a +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..859d17d11ccd38580dc19b64816b9735e6670e26 --- /dev/null +++ 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3da9cc9f9bcafbb11eace20645efd7e950622e5f128dbbf3fe4f9161b69cde4 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b76720fa02e29d0d77a1212229c8981d939f9384 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64525f110d46bdf7175bf9e1f560f642984732a076618aca5be0b95371b5c201 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d654bb1bd16cca9a8cfc6e973c573f663c47c9b --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a8b7a8a0df46c8738092f1cfeb2cc20c5d4316736711f7772e88b6ab87e4285 +size 199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..66bc0e4b15e06464377ff19930d3e82b6876bca5 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:636769fc14d7559ecec5d35cdd116459da47a0393c48eb004f15228560f9202d +size 199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69904c79646f32ce4707fc3600cf3def1b6202ed --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5201da72eeb07c0c49da0c27713d6d9fcdef3eca90802720c7c84e0bf7662f2b +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..883ed2b8c366e4325339581c0f6f0c44035f9218 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da20d5df1ff5325e920b33b0a2e1001e896119d39be865e0bb45858644801411 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00e633463bbfe2071625757569c489ed0bc49ac0 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d988455a000b436c56795736f271e001a3cd3ccd6f612ac4de76c196b728daec +size 199058786 diff --git 
a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aba369cb529c4f0193eef386a5392d67849e9109 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f1803f74297f9449db1ec6c3818106127bd2abf1dbfdd8a1e158e7124046773 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ddc8b400a46d78598b16859848515ac45ba0187 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b9b95cbc9a8827e44f7f52fac505e889f25a052cf41fff0c36a2f63d1956047 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09ebf8710b916463af525e4e346e6caf5afb088c --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2adb94312b3a41a6da6f1a2f608965c2f18caafde1776bf422eab58c0b5b29e8 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65eb7b33b048cbf828eaa414025ab195b52153ad --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3222b05e231d3eac61f72c559ca49d153d38ca4b88c09204b45143123779960 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5fbd37c6bc685e5f7d7dae54ea846611a4a4e2f8 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42f0ec49f0a913fcfdd5893365bb9d40b3fa0603fe744c29839b448cad3f450f +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6231982e9bef39ff7aeab0703986f0a6be1d037f --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51edb74fc0856ed719c40ef5e0afa8e61223f82f4c534f2b88ab58dad9bb6c1a +size 199058647 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c277d37490c27fa768e140d7bc02bbceb00fd22 --- /dev/null +++ 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e97efdcec1a68ddb3b4183ce8f3370396e22b05adec88ec33da56398bad07b24 +size 199058647 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ff0a6904f29c5eba7e792dd9f15f20b64fa9ecc --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffb7dbd1271ea5e7e581f5417987bd3e9fe733f6c06b2c0a12fc9a5608f41779 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..819633cece97fdf5906ad09a91642741d8a69a68 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b588a234adf7ba7f947d71a40e70804560e5735d0225246ae8e9fbf85ec798 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54653df8fb7155a41e4cfa9fade76cf0e14da2aa --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26eaf970cc7e34a5cf53f72625d5ed2938648e8f0727bf48447c1fe62d842576 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e95a58dff2de2555093ad4661dce97037947ba7 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b05f0c2e3ff3f2180e74746e8b2e61a5ef6e5c8897fc066b1bfc4c8918e14e74 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31570c79991e93b0e480e75583c1677c9dec8a39 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c7f57cb589eb731d7bf38e6f6870946e92dd49f17c38991b7869961a16ae1ac +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d1b7890479f26a744954ba7d6d6b2a9492c4559 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26c3ef937fa2cdf8d66cb6bfd5eb2b2c3bdbb3650cc66335832d21993ff4f8eb +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7688ff5df8be9c3bc13eece0a476e8d0f8a8e992 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e10b6db8afc8bdc3046a3a277430255c5c69f70c9c3e6d3ed18bf76deff1b4a +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d435490bfc882d284cff5d4a807b984e623694b4 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15e22db8a1c47de5b1466a0818d928775c3da4f32ea4219723a75dfda0ef3b49 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d74058401b1f192dcd9317e17876c7daadcbc3e0 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ce3338e251586e5ef69caaa009352116747ea44f9000a1b4060e89a2257d32b +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce1de9a30abd884c62fde3ffd93141099a1190e8 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc933154b53ae16b24d95f82876fab8800d2e9087ce61a3d90d7bdbb7683313 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e45e9fba04ef81fc3cbef2cb68be1e6ee7be0d96 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6ff2ec167622de8c2ea338962f7f587ef8a3a44c2885b66a8b0a63e76e4f75a +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..937c0c2c47d8fc25ecf93ff813213abb075d2618 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9a3a4bf9823dc19eead90eca7341760b455f325c938becae24e7b4eb0786e7c +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..063d46706c4bb2b9a50fd6175e0b99813cc6f9e2 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:c778f9581368d835d3994c0cb866c82e576b53ce85bc86dd4c886144b6e6cdaa +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58c160cf83215e0771a115026de5f1bee29be3be --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bc5944198d666dea872d839f9051229cd4cb1e43c203a6e8518b25bf38cdb6b +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e980144422804ceaea68f5178861f7a05c2b5480 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c4c7905360ffa242c36ce6ed32f90779818ac91c349f048044c387c6fcbc46d +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..578fd8e5697024a272aee67b9dc44633eec1ec63 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78483b43f6acb99015045e81d044d62535d36ab17ae57cdff372478275f4de8f +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fde990e977808ea307cd0819b2d71bbafa049d7 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b024e586516492bdef3ee13cd3517ec595c9e608e122caec49450deaf1300f32 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f02e4eb9f19b3acd7a8e3d2efd224f804d41d615 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:153abc55351f643d04a448fb032e40de5a6f009f373903ef64b4e067763fd2d0 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef2b78b6f61ef19d6de7267c24561018cfa2c04a --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2977e3cb18688281703ae70919a6fb05f8cdac7e07adb686efbec60896a6a5b +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt new file mode 100644 
index 0000000000000000000000000000000000000000..fb748f6df0673acb3e05fde91518ade0cda19b86 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4394d6e8c3b939749ed303af3b9f37ee62ebee36f3f5336386197a3ed9f8ca31 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53c7d99fd5b2c6dbd87acaf3da85c6944b312c5c --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8daa293d269ace05e19633b659fe2d0a2a6702da2ea382a4e51f9ad3194720b7 +size 199058647 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..815bd63499d0e85a6984ef8eb98cbd3f10adb802 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:429c648edd5a9aabb503c0e01a0c522dbebeeb306685be9c1f53e794735dc6b8 +size 199058647 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aacfca71e06d9e6f35a0ae00cf5135908762928c --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e60c96571b17f502ba7e5db3be9383aa2939b28b3f97a9222b6b28cda5c3fc7b +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..177865f48ad257a53e4fdcd3333853a1ecb5fae4 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1acc2f5850ace4fc9bfad6b010622e88b744e3bb672406748196a3b43e409efd +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c678ceb4eb2057a0751c90d20468b3235c599db --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eb3a12ba87620e16079ff6fd6aa2dacb1be16fcb2c8cafcb1b8cb3c8c4e8232 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d704943a663a67876f0b2799a1b9cb9fda3954dc --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ffee0ec100787dc640eb976c32b5986d8493a87c28424e70f4a143dcbf9304b +size 
199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7fbf3bbb58a3a61fc1491f8e92141fb27aa2ffdc --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:013f19fd5a522f22b1eb2beeb3771d482f47233de3732b3e15f1790783a523f7 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8395874be6cad066af98370ecd547e596e1b51bd --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ddf7ae189cbedbe3c9cdea99fece05e33017f610e153d6aaa9db0446bb1910a +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b09ac0d3ab7cee76d976e9451d55535d394a89b6 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e2007817c9b30e82da35a859e5163503cb78827c0e20e036bfa0805eb731a48 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20728a94df802eab587b14ee82562a9ad05dd312 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7221d67c1b04ddad2b3a143f31f8ff7c02d7a1754779992a54a239e6583efe36 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac2c9fffd76cbf8145ed28032506314fba49a13f --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cf6794521c2a76a9535bb5fc9b76992770b75d2f6222f1b24f8c48cc91983a0 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8ca5f35dd6ee08b1b0b4fd3b35df71de5dd970a --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d76e14036d7e2a060cb2f4df4abee6681e1246529d8009d6434967177d2567d4 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2635379efebbb430af6d642ffc9f55e80297fd21 --- /dev/null +++ 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22b3d80bc4e158442f5c3745b4cf1a77eda1ea838fea4127f68656bce3a143a3 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a8efbf015983884badfcb2c5b70b95cbbafa03c --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9732b3616a78ebeb26a38640e27c0c6fada92b1e4cacc20c11356df50939f44 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbf7307111dd81bc79e8d73b016768f805903d35 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:674d601d8be1cccdbfdb2ab03a0b646f1172aa0a5948b39a6facc147ce38bc1b +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cad1ed4887c17136d9fdc57b75f73cc9ec785cb --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf7df490eb878dfb75979d797ae57c1c1f1f00cd5429feb49abd3826bd76771b +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad4ed0ba5a5445e3dd6ce47aa18837714bedbac2 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fbf7ebd9226ee3037a37c14012a336bfaacad5424e336c18047f842d494e909 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..215c59446772bf418bf3f46aedacd831dc92cbf9 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54947bf29f4f04c3b317c5f2bdc4b88035ef55043003f92f9231108f0653ccad +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6965b227165f3b8254466fcf201ea74a88f0cb9d --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc2a768382ce2874423fd649eed30b28e6cf3f7fd1f03823b7e3d6a25319b315 +size 199058658 diff --git 
a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8631daa8e2a3bb6581683ee5ab9a837d52a00a0a --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5cb2ce15d8d78a40d9489287bcd1d285c1ec61ab60c828d2098a1ad4382673e +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11ae3e7bb6d1cb688777aec3c0fb5bf0c756ae4d --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b64cc0c15d6f583a0c0b920631b0c2258ea5b3d8edc534f6310b0a316503a924 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1473286a027a75acb8dabcf8735d2aca6ed7a7df --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11eb066a0620b1d2ec9ff63ce4ed5f36cab1e9a896023efd5758182add9a5cc5 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91b7334e2a19a137e2f8d72ad8263d5c589270b6 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcc4b791df9bd7016bd71ad632e4f5ba522dde2611e8e55d8a95df99f1fbfbdf +size 199058647 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e426c744c742f9e1d32a67c327b11588af759b46 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22efb0ffb14fa51d518df138fd94fb133a167297ae41d587422a579e5d3b40e7 +size 199058647 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09893a000d6ddd331a97683ee9d0d29b1fb7c172 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:686e0f421d53c6f26c737dbe5971c40c305f0e2071aec88719f4af76464e3fcd +size 199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9fe4afb1283be70f9a29047d37865ead8571326 --- /dev/null +++ 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd07d1bba31c9161831f7d76c6cfe2bfaf33575f3db0962d690deaf62aa96e02 +size 199058850 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9661120d450ff6cd2f68418cc685056ba7051415 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28a3ede609ea4ab6b240327b8c84b55bdba5f27f22d3c7f4367c8c9f5dc0e8df +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01ba08d8b06e3e4f82f8a892dd8b43dba1d45552 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:741f1c17361bafc05132a27d0d0201b973d38927b29d401453023808fe5e74d3 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eea596b14b77158efd974f1a221404c78b76fc8d --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31f71949a65cba0df57422ced681fa4e84f2cbed9d34cab5d56fd1659044739c +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..52cba4d0cf7dc3fea3d4d09485b2d24eed636b63 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68c5408c22d9f57c7aba989bda1b9fe20aab5cc9c8d52bad802556823f5be6f3 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cb5b9b77641426cda3611675edd648693792701 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d026ca5963ede3c1b2a25afede198542f1b6d22c0e78cadb79a33f05cd23c29 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..08e6bce52339b2ad057c0e740b1d967fe6996e8b --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88ebc6292d64c3edbb7cc0cfa02d4aa83ea2ad46cd2ddcbceb3f4a106a3c5372 +size 199058786 diff --git 
a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee2d1832cb3ea7e6e384f98a16a5038976b19a77 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7fdf79a1927cbb58fbceee8698afc77904d056e9702eb4e20d874c3edaca22e +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bab1c70f65451c5a4d17b99f24015db0e2646216 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1fcfd8fb6424c8e15cfaaedf4abcc20214a343810315f4246e3f468ee2040f5 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..17a0dfa001c74b92c700893200c3030cc1701fc7 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e57e5ae246fe97473b1812ab64921d1e7eb3c0141085e9b7521d0f7599c2825f +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5655ff8b6b351cc51948a70709ed107619f2dbb --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:796c45dd479d23b9b96f3263733e25a7d6741690d75eedc94d7338ac6214cc96 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b90d923c7a0d30b2284c906d37e1d486f62b8e2 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25b8b26e98f2ba5b1581957573126e28219492908d685773b82405ed9d9b0db0 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7890f3d9441c77c983eb07cd999e5e7304e26d02 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e06d3c0906d0ff705d34cbeaad43e6895a8cd9d3956b727e69c71e67b1c55ba2 +size 199058722 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4c794199e430c4b961e78535c11599d4fa940b0 --- /dev/null +++ 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:228d9e55dd2051257a13cf2d342dd6a179d0829ea0106ac9e2b297b7ff3f4db5 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..12a01bf7cb4c08e6960896772c390a33b4045e28 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1dae70f68ab828838123a5758d36efa15da947f53dbb0a123ad42fd1fbbcd02 +size 199058786 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4986667748a4a64b16d2b548d710f7640c065c7c --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57b64a55f23b40786d3df62c42e170d433557bd4141cf7f7250a604e4bdba022 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f357a7cc81be47891a3417b245d4d24103a83abb --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccb1d63f64f978dc84d73ca3990afa7eda98578f35dde3be0187ea5ddc8fbde1 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a76309c7223f47049ce9e42d658b412adb60a885 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b436332baa0e1851f72d14a190fdecae4341a3bbebf3a5af0e1ac18c47f2dba5 +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce6571844ebd357521b4882374b60999e9708d52 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecd4335b6b0a0153607a8cf2b032ca6e41a0a40ddc4e251fa05ee68dd08c9f5e +size 199058658 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b19ef7d46c476dda65545d91353b0d69c2c4dcb --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faa43bfc582806d1bee27139246452921ab193f334b7d2fd7597686c371e0214 +size 199058839 diff --git a/4b284b84boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 
b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09eab4c38a9d9c3bfded1109dab68b78eed81e30 --- /dev/null +++ b/4b284b84boscar/global_step80108/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd56aed614c8ed42953bea4549b712654d0fd7688f7497296f1380a2f33c32d6 +size 199058839 diff --git a/4b284b84boscar/global_step80108/layer_01-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ed7067d4048b2a9b702f1040a65b78a50e29aa0 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e22f9495d4d4fee9a72fc3843c41b51b0635f56b707dc94876977da283bb464c +size 167511299 diff --git a/4b284b84boscar/global_step80108/layer_01-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_01-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e819d92fdf6722ca0fb6a65feac4650eb8f93bb --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5718bf7a7d1fd7aa32e8be646362e299e904d5e9029ad6f2c5aa2f1ffc6afb02 +size 167511299 diff --git a/4b284b84boscar/global_step80108/layer_03-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..722eecf3620f974adf6be11d77e37417fcc09333 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7fe9543769cdbbafe1e0f6f31f518d528a8d063d9b058507dee364c0723369f +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_03-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5aec2c4798225e100d7e4faa432d8bff1708ee6 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c425dd275c490f426ca7ab0bbe8dab6da5fbb46dbf444d496d75c3fd7b3dc56 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_04-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cbb5255c1dbd8b012447238002d057ae2e452c6 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48663f5ceac951cfb90fd0db004053a4473a7b78dc1e896f4914bbfd2a4b9297 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_04-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8826b3881b983666156960dd3e2dc2d4714399d --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65883bad7b03c71064e209551ce5bea0d9664cf9eafb3fb8da34af8c4b07e82a +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_05-model_00-model_states.pt 
b/4b284b84boscar/global_step80108/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1335b8ece044f0220b91b356f013c8b671319af --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c56de51050a30c43f21f915e2fa10622e3e255d4dbc4d4b52a909cbcb93f4b68 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_05-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cbed2422c2967fb049b597d99c4b502349f36a4 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c1b959476b1f95b330848eca6b31fb4e4cc8aa0c2b34560d62f94d7fcd7cb8c +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_06-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25c5918be9efe406c21c2799864781a76e75cf63 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cca1142f6f106bffbe21b4257db39b261812662df1cff49f31f52028aba7da34 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_06-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0734a2fcceb4dfa50589c453ff61e98d6441b57a --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3092079e68c46eafc9450e9af743a6036df30f6064ee00e10d586a7f59b6663f +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_07-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f24d42878161398e227958a81353a26fec9a642 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5a25915630d9d65eabf4b62d5b4a4c7c4dac8e296969630d1d054dbe8522b54 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_07-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_07-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cdf96da1d1b2f6122c357db8cf9cb8a50423bed --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba2e4d30b9fd127f98649acd5aecb9168adf64d6926639683d5df0f2b09059dd +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_08-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9838a53d2b070e8b9362782f6be9a4c9789cdb7d --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f503e82f83f3898ca3ef73efc9471f242d6ef49b61bb87b2f3432861f45d8740 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_08-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_08-model_01-model_states.pt new file mode 
100644 index 0000000000000000000000000000000000000000..2cb920a333088fa3ada6057b139f7171e3dcfd7a --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39a5ccc3742ecaf776489749ee7a8fb65c18a6437b542e9211ec1f5aafae1a37 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_09-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7667e272292cb64fd6b58d1a7c022eb963b0420 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a393437dd3032eb0aaf77c981f72eae30218b7a0748cf22e7112fc8c3b1e996 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_09-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ff82382a283d92cc8871906bf7836613274cf37 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05f22f9c3c6b188feb9717252aeda28150e42f41f2f6ea64dd74ea302b442b97 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_10-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b35e1b19099d404a8d06334c280f56143378f29 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9718002d40e64a9f70cdfb35217561b01d1c0ccb03a32101344dab6f47baf95 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_10-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..694b94f433eda81919ece65e907d2c3540352945 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d30956bfb07eb3102f5a2e14b3d8e2ab45a728496009d22a8f3ecccf56c284a1 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_11-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f43415758c7d6b6187ad8e06d6589f6c710c6c45 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2469166f37a836da83a305c7a3b4e6915c02f7b16642dc322acb8dcd20df03d3 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_11-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_11-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cfcb6590ae16fa190a6ce13ea1ad6bb735c8ab5 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf38c93b06c449ff2e6d3b8d0a54c1f44d604b09314ad565a37a36c11d6e75a3 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_12-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_12-model_00-model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..c4a02c85d9641ed34fad61e210ad1a62ef9953fe --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f24b185a678c2b41236dcf13c44edaea29710fc20da5f5dd9aa71b4c3042f7b +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_12-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_12-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa317a9e5de4d4619ac73d429c1982288abbbd1e --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd52832278d644ca0b70ac2a8de7afd1e10e690f32a812260a2c99bb14fe4f23 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_13-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba3e6bf28c6419201fae260a3ff1ccfca13162a1 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09683b54b36219b75a8c8ca3c3de4980e6db7eeb6eb09775d5f0a54fbe931036 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_13-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_13-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59289c932e2857ba2965599463912d55dcf462bb --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47eb77efd7652342bfbfb9edb8535f7daa00656a51833c6567cda6b299589249 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_14-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..565102dee8448fc6ce9df47031a71ccc0c31b8da --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:000f0cf1848434e3cee7d46e8ecd4776f3c89e9b36c8cf0072d427939cf569e1 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_14-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_14-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79dc29443a39da223c930d688c06aa737760f5b7 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9783f6b8bf220589762ddd2ed960296eb00b138a7413f09b9fa05ff8eaa3dc5 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_15-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c66ef5b582ca6e7c1f7075fef958ec0bc597686b --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:177c276d0f15b8d6091ceac04cb1ed611d3a32074ca21ce8794fefd6ab5044f7 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_15-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1eb026f468ff006552308a42c1844cd6db8109b3 
--- /dev/null +++ b/4b284b84boscar/global_step80108/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2adeba892cdc426b60cc9ffd39b7a55cf83bd4e8fbea816c5a71c446d7ef362 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_16-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb2a51a730d1eda833d377b7e63dfd43bc369ba9 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efdcdfdcef5d3844ba8d3c74710395a4e607620e91a379222eb52c6fc7f5934e +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_16-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a30860f3bf754eff27f8858380fdfce92ea21cd --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94cad541bdd828d43a103ef3f842476feafd62d718bfb81d35168fc0b4679465 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_17-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..027db5c27763037759320bbd44edb063ac190fba --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7998e4299e81c02fcedd30ad3c859d8bf31579c234d13584d849a321b6d48f9 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_17-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..669d9cd005137d296be3ef7da23e703e993e1583 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4832a872ec3617c9cf3fb5f7789932bac3b212ef96fd776a7e2a3659a32a3538 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_18-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64d2cc9130661e1cd9057fb6efc1075ede661a08 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b06628777e553c7b0dc91b85f6658e600c173a987b159b65c81bb94a5a40d5a +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_18-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_18-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d10eb73293ada53e31977ac40764863687ed225 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b00440090bf1ab90ac638e7d34992a8fe6b3b408cd60fb850de4aab70161a4f +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_19-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6796243bf42352ccc56adb3ce618c87f63b8f04b --- /dev/null +++ 
b/4b284b84boscar/global_step80108/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6860bc61f79af1d796171a042da6c553fa7b151960757e144a36a6135c64a94 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_19-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0ee6edad988b71b021c04ee06cae39da93d67d5 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcd7ea050fb035b8df94f8f48f9a442d8cd2a4a08182b79849bbff4eb9fdb18a +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_20-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b60dcf7ad198e32adde9beb4a826b00c510dffc --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0449652c84b44d6c739bb52bd6ebc2b31f7f098277e88141e24894a67751ba1 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_20-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1129c06c081128c3c24bb8d19fb703fb3a54f880 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6877248f80ba5b7487962c8ec22ed440b9cd1ea190a1e7cdee6a73100dc7748e +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_21-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ca6b4fb54a16508649545745e81ea023ae73c73 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edfb2c7efee57669e29581c31e373a0adb3623988934b9b367c8509a41bdd088 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_21-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6604f0c8a0869e54f2c2e9093cd072c96a9ef2db --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e001e6a4a0895e13c024428bc3f9c96a7a47741bef537536a2bcd67d17181b73 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_22-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0bbe8e77ab2319d8d1e6cdd0540da512b5fb9d7a --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57aa3ee12665796a8ffc629ce23a93cd521e35f0f77a6da4b1f2664b25a59eb9 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_22-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d413b001957e6881bb75a51a70a03a6304710985 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_22-model_01-model_states.pt @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf7660da17779951a86d4aad29a1e10f642fd54148295223adedfd841aff4a58 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_23-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5dd625f01efd930fe2e058bdc9b5299e69e3781 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feec7dcd11e0015e435da0150005f88b2d7c15cf5a12c685997e3a320abffd78 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_23-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d98eca8b38068f5b96bb23b9fc4e867540c1b72 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e18a1f3f070ecb6e82edc50c434c3c656eb4ec6010b4f717d0c8b854cbf6f3ea +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_24-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..94d8865ddc37914c8045a5e0d3416c9844bb189b --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_24-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd426cf0094ec968a5d265a318a53839cdd9ff4b0775ac0d7fb5f6195bd2ddba +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_24-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cb9673a5ce34cc5201fbb434112a3566da92281 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f86f9beccb2f751792901d3b66273c1a5b69e5a28b0d9b003c2e3ee1dc203f36 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_25-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed7d228980e7f72e77367a694695e82f1cc240ab --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3a7072dc5c620e7d719f1e57a1abb433c33767be30ffe54beda56f9fe4f4e7f +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_25-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0935c9393d4c981f3bb9ec29d4765136e9e5b712 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5caa389828fdada93848d4edc97c7d3253ab48895c5e41c242701eafec1d132 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_26-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e46f4e6d79fc87de2563b2778c52d7e42dd91f8 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c84b222fbb4bfae9ba2b663b819376ea266cdf472d13c5a4867c316c60f694be +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_26-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_26-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63a7ff895d6bf1954025d8f02edd824861fb017b --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_26-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:949969a1a0e5c5b93bbff3a1ed4108591396067d485d745203054ca928fd1b4d +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_27-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83c9adcf3c7c6287212b9218f13c4fb5db096bc6 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f080ef12bf0e1ebe2de383ae344d7373ff8b25d089279ef3e088de67800754c +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_27-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ffd8e3a898697675e1defd1834c3ba4823e06b7 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6da7dc3c323e2f96e588f41bf7dc070b4975a0a392fb48aaae4f3785cf67bf5e +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_28-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_28-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2280aa9af239229d0d2ebc792737b3619d3ec45 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45e0cf52c8e166cacb5348d4d1a28c83f03c4a869ade5c1027ef8240894b4038 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_28-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f74ff2cca917cbb89b40783ad3f4529934a22f8 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b39e224974b244e607a9c2f09b884abf94a1c12a836a730d273dbba54a3f4cc9 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_29-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_29-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4a937ace2f6ede677bb3642489ff04cd5415384 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c4713fa1d37f40790d2f8ca43ce581e386e29fef5831d625721a6ea1524773 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_29-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d4f217ba7c85b88127c398e74a4c48672704fed --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52ee09df8ffad17292c1245f805470fdb6d0fd915c0613de60688db00c03bf55 +size 
113308931 diff --git a/4b284b84boscar/global_step80108/layer_30-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82589f108754926b505cb4d29ef245ca973f7695 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d63cc515caaba6c5691606688f8d5af582a550b95c632071e7e8e5e0600799ac +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_30-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7656c988096eb091d47d0887b8177df1661d3cd5 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e42946f4b5493383d48b828c73a05422103ee919ae4c282f2e1a1b42cab7a6e9 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_31-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13afb87949b64492311cccc540d6f20b528dfad1 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682ef36b79185721113c92f5459fe64f04202cd7e6f7d515dd49cb38c099a9b6 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_31-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_31-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9972e1638d20ba68171a8baf3f3fe2d812d2810b --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:762f482f7b5fbcf28e04536e98be9f28671628385ccd1c489a7286f72998450f +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_32-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e36e489cefd92709e061c3778a0303520673b1d2 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5d6141560fdd3a1e85804d43d53da3f179a02dfa10c61ef5187749bba49456b +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_32-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bd95750ce6955f6ca7d034f5ad1b4f9fd8d6ae3 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:182dbcb801ebc5fcdb22f92847ac92ad2ccfd54416ab329eadd66620ac9b86ce +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_33-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_33-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9766c5c93c020bde2b4892b6307c54a220c752dc --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bd553424e8e4af916d5f3bdd349fbcc09fd19d61c12f8b38cbc6eeb6e69d25e +size 113308931 diff --git 
a/4b284b84boscar/global_step80108/layer_33-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ed4eca6617f3b834f4b18c113ff943d12d53320 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:437541419c3d6532e4ac43753ab316db409cc909da29ecd4783b0b6deb5d0eca +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_34-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01196f7e887291f5ceaf5f6969514253cd06e35a --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7ba3a8844e72334f766d31786219af8eb6082ac1394f90b8736a0bf6c1a4f70 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_34-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ccc29799a6157cc00a7a270621566a43a6330af --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08671c552efd3d57569e84e5d62d14f7099fa47acc662847e911a22730074687 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_35-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19f76de660c97a1b214c2e16c19e74684c3db5bf --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdad5ec9e6274dac33a2ad84f6561a2893e4954deaf6009fcda1b1cf4130f402 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_35-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b856b9fc58e772ec37542e5a39c5b72cd9ce5fbe --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:128bcaea1ed95a9d4d6cd04900226a49e5074c91a7f05fee642e510e11b81a6e +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_36-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_36-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79b3a3ca16308bb079022c59c60d344da8573890 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df4d5b16626c24a39868b4817b3d728b6bae145b6fe3b89813e2d60ff50cbf8f +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_36-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee62d3d42a16189868d2ab59d3411d3355428897 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17724afd2755e5962dbd1322cbd176f33bf6e8ce2b17e9a4feee3b8e658dcd0c +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_37-model_00-model_states.pt 
b/4b284b84boscar/global_step80108/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cbbbe5fbdf05a3347524f21d293650d9c664918 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78d2c8e43c1e2508c98c020b07d45b8f32634b3f82b053c43408d4b8dd0a1bd2 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_37-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9894bdaec1aa698c48119eb5ed2c6319436a6652 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19031eb38803d639a06c6746de7ed9adc851e057713b93f0d3e64638edc70139 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_38-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f560a5fb00037d352c9eeb775639b7d7d8ea60ef --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:915d86d39b492c0bc06f5c3726c57f5f4a108a9522f61ae6929a5a16485da23c +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_38-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c571bf5fd09948021a3cd3623ae690c5bf2cce19 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df7b27eb656f4b5b89f286e613f6d6f8e156e45836188f8c925dc4363a4080d8 +size 113308931 diff --git a/4b284b84boscar/global_step80108/layer_40-model_00-model_states.pt b/4b284b84boscar/global_step80108/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b0b298b346c32ca2c07f522826cc13c848cfdfc --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:499a813689e8075045ea1fa2c67a4f9f0951d183b8d792859de3c9eb5a0301b9 +size 13507 diff --git a/4b284b84boscar/global_step80108/layer_40-model_01-model_states.pt b/4b284b84boscar/global_step80108/layer_40-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4eeaae737b30946222a3d8264935983ec783f41 --- /dev/null +++ b/4b284b84boscar/global_step80108/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c8658f1a040f7f13a1965cc9e462e95d6189184a2e321cc536dbea290a4e76e +size 13507 diff --git a/4b284b84boscar/global_step80108/mp_rank_00_model_states.pt b/4b284b84boscar/global_step80108/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..876c9ece9c7b7bbfcc488fa35b4e035640aa6190 --- /dev/null +++ b/4b284b84boscar/global_step80108/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:288cba65b048b43ce1d15afce2850fc246097a358b7cd3cf5baf4fb21d218fed +size 51635 diff --git a/4b284b84boscar/global_step80108/mp_rank_01_model_states.pt b/4b284b84boscar/global_step80108/mp_rank_01_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..25be8f850967bb29c6229de1d81b964c0331f085 --- /dev/null +++ b/4b284b84boscar/global_step80108/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a75e664964b4eb6b436833aeac98d6d54723ccb0f011a05fd6718cf194807031 +size 51635 diff --git a/4b284b84boscar/transformers/config.json b/4b284b84boscar/transformers/config.json new file mode 100644 index 0000000000000000000000000000000000000000..df814c3a71ee49044410636b5eeb4f43611690ba --- /dev/null +++ b/4b284b84boscar/transformers/config.json @@ -0,0 +1 @@ +{"vocab_size": 50432, "n_positions": 2048, "n_embd": 3072, "n_layer": 36, "n_head": 24, "n_inner": 12288, "activation_function": "gelu", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "chunk_size_feed_forward": 0, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": ["GPT2LMHeadModel"], "finetuning_task": null, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": null, "problem_type": null, "_name_or_path": "", "transformers_version": "4.25.0.dev0", "n_ctx": 1024, "gradient_checkpointing": false, "model_type": "gpt2"} \ No newline at end of file diff --git a/4b284b84boscar/transformers/pytorch_model.bin b/4b284b84boscar/transformers/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..cc2b518d3f8e5f6c1aa85e30e74e8816be2b0770 --- /dev/null +++ b/4b284b84boscar/transformers/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb79b929fad368ed8c13ddef6bc76172cbbf95b343122057609601d3b090ae1c +size 8781203669 diff --git a/tensorboard/tensorboard_4b284b12boscar/events.out.tfevents.1674477186.nid006596.76000.0 b/tensorboard/tensorboard_4b284b12boscar/events.out.tfevents.1674477186.nid006596.76000.0 new file mode 100644 index 0000000000000000000000000000000000000000..91149875cadc0e54932184b5262231c450e027bc --- /dev/null +++ b/tensorboard/tensorboard_4b284b12boscar/events.out.tfevents.1674477186.nid006596.76000.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:bc1e833f077737bc7cd1e71da97f3a8a903e450c69e44f329b16840c6838f123 +size 108399363 diff --git a/tensorboard/tensorboard_4b284b12boscar/events.out.tfevents.1674650076.nid005381.86341.0 b/tensorboard/tensorboard_4b284b12boscar/events.out.tfevents.1674650076.nid005381.86341.0 new file mode 100644 index 0000000000000000000000000000000000000000..9eb6b86b7eb3f2f094717c91f22aca0c40b49825 --- /dev/null +++ b/tensorboard/tensorboard_4b284b12boscar/events.out.tfevents.1674650076.nid005381.86341.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea90281d45a00431d693f1838073ac3be6a0167d348c9011cdbae6f5e6f07482 +size 40 diff --git a/tensorboard/tensorboard_4b284b12boscar/events.out.tfevents.1674652071.nid006817.94372.0 b/tensorboard/tensorboard_4b284b12boscar/events.out.tfevents.1674652071.nid006817.94372.0 new file mode 100644 index 0000000000000000000000000000000000000000..5ea5285338ade5d66694dc975eca98190b7a1ef9 --- /dev/null +++ b/tensorboard/tensorboard_4b284b12boscar/events.out.tfevents.1674652071.nid006817.94372.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d29acaf7a048f9c5a8a236e27e38da8b2dcc5dc84c7ad4407436f19e2b487ab5 +size 36178846 diff --git a/tensorboard/tensorboard_4b284b12boscar/events.out.tfevents.1674709629.nid005245.32407.0 b/tensorboard/tensorboard_4b284b12boscar/events.out.tfevents.1674709629.nid005245.32407.0 new file mode 100644 index 0000000000000000000000000000000000000000..937a19f09c7b8ffbcc8f17663f83ec3cc3fba931 --- /dev/null +++ b/tensorboard/tensorboard_4b284b12boscar/events.out.tfevents.1674709629.nid005245.32407.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6da443acd62065efe93435a159d59a6e9d4aef8e165374cb4644c3812c53c3b +size 40 diff --git a/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674477186.nid007120.66532.0 b/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674477186.nid007120.66532.0 new file mode 100644 index 0000000000000000000000000000000000000000..955d3f8754c6588e310dfd23797ad46dc7244bb1 --- /dev/null +++ b/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674477186.nid007120.66532.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e2a03711154195d485ab48991d35321d7fc94bdb695f67bf15fd454ddbb1c4d +size 44329555 diff --git a/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674549772.nid006278.102096.0 b/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674549772.nid006278.102096.0 new file mode 100644 index 0000000000000000000000000000000000000000..b2098ab774b5e8907c0ad8d7817d4a62b64aae48 --- /dev/null +++ b/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674549772.nid006278.102096.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6f21eeb8b8d7926490ff0a269a6d9e04efb91ae9b3b36f9e05354aa019111c4 +size 104246718 diff --git a/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674714525.nid006367.88512.0 b/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674714525.nid006367.88512.0 new file mode 100644 index 0000000000000000000000000000000000000000..f752b2b0c2d7124d02f390dd6e5c986add2200c7 --- /dev/null +++ b/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674714525.nid006367.88512.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d0c8ab139023d7bdf7a7e12945b36f0292fe4f8e6b4ae14e39f392ad6384bb3 +size 18197406 diff --git a/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674774312.nid005245.74273.0 
b/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674774312.nid005245.74273.0 new file mode 100644 index 0000000000000000000000000000000000000000..8cf1a45e7e1e75b15f2097635cdfc8b72a654f5b --- /dev/null +++ b/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674774312.nid005245.74273.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adbc362dd710a74bea031a9b3dda7bae749c73048c1718ca396e8f3f0dae04da +size 16424 diff --git a/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674774481.nid005245.79990.0 b/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674774481.nid005245.79990.0 new file mode 100644 index 0000000000000000000000000000000000000000..765b9b5ed8c70b5a946efe68b469d0cd2a875bea --- /dev/null +++ b/tensorboard/tensorboard_4b284b17boscar/events.out.tfevents.1674774481.nid005245.79990.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b578274e5be2411e45c03a1a1e75b52bbaa383a22bbb2782d0d74a635dfcd7db +size 40 diff --git a/tensorboard/tensorboard_4b284b21boscar/events.out.tfevents.1674472728.nid007239.75945.0 b/tensorboard/tensorboard_4b284b21boscar/events.out.tfevents.1674472728.nid007239.75945.0 new file mode 100644 index 0000000000000000000000000000000000000000..9d049a52208286283d8e288a892129b1cf05d6a5 --- /dev/null +++ b/tensorboard/tensorboard_4b284b21boscar/events.out.tfevents.1674472728.nid007239.75945.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34f0d9d4c3219edc26c8e6e8e7cec68680bc887127c5079b0515cc75ec214f9a +size 108803242 diff --git a/tensorboard/tensorboard_4b284b21boscar/events.out.tfevents.1674645538.nid005814.4977.0 b/tensorboard/tensorboard_4b284b21boscar/events.out.tfevents.1674645538.nid005814.4977.0 new file mode 100644 index 0000000000000000000000000000000000000000..ef7fcc3cea4993cc270c163af64d4e4f487665a5 --- /dev/null +++ b/tensorboard/tensorboard_4b284b21boscar/events.out.tfevents.1674645538.nid005814.4977.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b9b0c1ee98be2396616ef4b1a10cf33b714bbed758fe310b745bde662ce9d93 +size 36178846 diff --git a/tensorboard/tensorboard_4b284b21boscar/events.out.tfevents.1674703090.nid005381.72092.0 b/tensorboard/tensorboard_4b284b21boscar/events.out.tfevents.1674703090.nid005381.72092.0 new file mode 100644 index 0000000000000000000000000000000000000000..59b32fd559c873e1bd10285fca141aaf659cfc26 --- /dev/null +++ b/tensorboard/tensorboard_4b284b21boscar/events.out.tfevents.1674703090.nid005381.72092.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65f87681b8b4cc06cd6d2ab0fff785ab56df8884a97a195fa6a8291906962f53 +size 40 diff --git a/tensorboard/tensorboard_4b284b21boscar/events.out.tfevents.1674705049.nid005245.126559.0 b/tensorboard/tensorboard_4b284b21boscar/events.out.tfevents.1674705049.nid005245.126559.0 new file mode 100644 index 0000000000000000000000000000000000000000..1ab0284dd6447649a1d51dcdca55e8fe4457829e --- /dev/null +++ b/tensorboard/tensorboard_4b284b21boscar/events.out.tfevents.1674705049.nid005245.126559.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa907919a1cb26438d50a74254dca932292bdc3e4c64d4f96eeebad9ab7c7536 +size 16424 diff --git a/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674432666.nid006454.112316.0 b/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674432666.nid006454.112316.0 new file mode 100644 index 0000000000000000000000000000000000000000..ca18665ab14cbb5e50e798b7bf760416d9322c4e --- 
/dev/null +++ b/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674432666.nid006454.112316.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:501315089b692ff065215c55c5dfafab18e0362672c125a8db22965526dabeb7 +size 108185834 diff --git a/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674605562.nid006847.119188.0 b/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674605562.nid006847.119188.0 new file mode 100644 index 0000000000000000000000000000000000000000..a53b28b98eff7f3d083d2d38b95b43c86c3bec58 --- /dev/null +++ b/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674605562.nid006847.119188.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f6b3c597d645f87789d8940316d2d42b9e9365f291902966eeb95aebc0035ec +size 1817670 diff --git a/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674608933.nid007189.50269.0 b/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674608933.nid007189.50269.0 new file mode 100644 index 0000000000000000000000000000000000000000..f25388b4d8a9217ebabce300d9e8ab5237c02549 --- /dev/null +++ b/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674608933.nid007189.50269.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40750090ea22e22dcb48ef525f189a5078172639a33d199d80bfeee50379e2c9 +size 21564328 diff --git a/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674642972.nid006576.38518.0 b/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674642972.nid006576.38518.0 new file mode 100644 index 0000000000000000000000000000000000000000..89b01010c7341bcd33d5e5867f11c48529ec0fe2 --- /dev/null +++ b/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674642972.nid006576.38518.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2528a09153278716c1d0e66e3cdc28c7cae79a92780a69843f0cd1a14a6f0ce +size 18197406 diff --git a/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674672141.nid005245.19617.0 b/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674672141.nid005245.19617.0 new file mode 100644 index 0000000000000000000000000000000000000000..88dffa654edbc5313f755c219b88ffc40d1abcc4 --- /dev/null +++ b/tensorboard/tensorboard_4b284b28boscar/events.out.tfevents.1674672141.nid005245.19617.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01d5303eecc3d5d704f3f720346deec4d261f56a6d774b2fdfa95d48e8e05df1 +size 16424 diff --git a/tensorboard/tensorboard_4b284b42boscar/events.out.tfevents.1674472651.nid006937.118890.0 b/tensorboard/tensorboard_4b284b42boscar/events.out.tfevents.1674472651.nid006937.118890.0 new file mode 100644 index 0000000000000000000000000000000000000000..e2202b5cbb3d70ebff437a743fc28a6cfb83fe75 --- /dev/null +++ b/tensorboard/tensorboard_4b284b42boscar/events.out.tfevents.1674472651.nid006937.118890.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ea8cc548fd15c44ace08094eb60aa85577e8440712d0e2b647b580fa10518f8 +size 108969526 diff --git a/tensorboard/tensorboard_4b284b42boscar/events.out.tfevents.1674645467.nid005381.48787.0 b/tensorboard/tensorboard_4b284b42boscar/events.out.tfevents.1674645467.nid005381.48787.0 new file mode 100644 index 0000000000000000000000000000000000000000..b504ea72c60355711006a8ee14949b67716bfe4b --- /dev/null +++ b/tensorboard/tensorboard_4b284b42boscar/events.out.tfevents.1674645467.nid005381.48787.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2304ecc621824858ab71feea48e335db9cc9951be08368a6858787346b2c80a6 +size 40 diff --git a/tensorboard/tensorboard_4b284b42boscar/events.out.tfevents.1674647460.nid005669.124109.0 b/tensorboard/tensorboard_4b284b42boscar/events.out.tfevents.1674647460.nid005669.124109.0 new file mode 100644 index 0000000000000000000000000000000000000000..2aa25476c040dc0551ab3bd7fcd72cf8ddef1dd6 --- /dev/null +++ b/tensorboard/tensorboard_4b284b42boscar/events.out.tfevents.1674647460.nid005669.124109.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e3e27277992393995e950115b230fb6d5f2118a19810aaec13c5de2c5c1ba30 +size 36178846 diff --git a/tensorboard/tensorboard_4b284b42boscar/events.out.tfevents.1674704580.nid005245.117084.0 b/tensorboard/tensorboard_4b284b42boscar/events.out.tfevents.1674704580.nid005245.117084.0 new file mode 100644 index 0000000000000000000000000000000000000000..89666a98bd1dd385433547d6f37d0824e14454fc --- /dev/null +++ b/tensorboard/tensorboard_4b284b42boscar/events.out.tfevents.1674704580.nid005245.117084.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae158ef979389fef16dde89bdb307ead1712ba0d2f0fecc3f7bd0666a98555c5 +size 16424 diff --git a/tensorboard/tensorboard_4b284b84boscar/events.out.tfevents.1674431340.nid005245.50918.0 b/tensorboard/tensorboard_4b284b84boscar/events.out.tfevents.1674431340.nid005245.50918.0 new file mode 100644 index 0000000000000000000000000000000000000000..97929ec37ad9f47306c0ff1b4e9f3684807ed5ac --- /dev/null +++ b/tensorboard/tensorboard_4b284b84boscar/events.out.tfevents.1674431340.nid005245.50918.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44f20a7421febd54544cfee5a7e03117ef71b3f715f055be5921448847335792 +size 108466325 diff --git a/tensorboard/tensorboard_4b284b84boscar/events.out.tfevents.1674604218.nid006541.28318.0 b/tensorboard/tensorboard_4b284b84boscar/events.out.tfevents.1674604218.nid006541.28318.0 new file mode 100644 index 0000000000000000000000000000000000000000..4e44d9776fce1b63fc809930aa6f1cfa380cfa7e --- /dev/null +++ b/tensorboard/tensorboard_4b284b84boscar/events.out.tfevents.1674604218.nid006541.28318.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f0f0104f9ca1a3e25a3941cd4a2dfe2ca051cf4ad085c74ed0876ac44e0a944 +size 36178837 diff --git a/tensorboard/tensorboard_4b284b84boscar/events.out.tfevents.1674661381.nid005381.39778.0 b/tensorboard/tensorboard_4b284b84boscar/events.out.tfevents.1674661381.nid005381.39778.0 new file mode 100644 index 0000000000000000000000000000000000000000..e7d9dfac2ffa9ab112edc4b4ca627d99505e7af4 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84boscar/events.out.tfevents.1674661381.nid005381.39778.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:511379847cbc381d94b06c0c5e8db5c4a2f186c78ceee22c8415b7ed61370d04 +size 40
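
For reference, the `4b284b84boscar/transformers/` files added earlier in this diff (`config.json` and `pytorch_model.bin`) form a standard Hugging Face checkpoint for a `GPT2LMHeadModel` (36 layers, hidden size 3072, 24 heads, 50432-token vocabulary). Below is a minimal sketch of loading it; it assumes the repository has been cloned locally with the LFS objects fetched, and the local path is taken from the diff above rather than guaranteed by it.

```python
# Minimal sketch: load the converted checkpoint added in this diff.
# Assumes the repo is checked out locally with the git-lfs objects pulled;
# "4b284b84boscar/transformers" is the directory created by the diff above.
from transformers import GPT2Config, GPT2LMHeadModel

ckpt_dir = "4b284b84boscar/transformers"

config = GPT2Config.from_pretrained(ckpt_dir)      # reads config.json
model = GPT2LMHeadModel.from_pretrained(ckpt_dir)  # loads pytorch_model.bin (~8.8 GB)

# Sanity-check a few fields against the config.json contents shown above.
print(config.n_layer, config.n_embd, config.n_head, config.vocab_size)  # 36 3072 24 50432
```

Note that this section of the diff adds no tokenizer files under `4b284b84boscar/transformers/`, so a tokenizer would have to be supplied separately unless it is added elsewhere in the repository.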